Example #1
    def __call__(path=None,
                 dataset=None,
                 revision='HEAD',
                 staged=False,
                 ignore_subdatasets='none',
                 recursive=False,
                 recursion_limit=None):
        if not dataset and not path:
            # act on the whole dataset if nothing else was specified
            dataset = curdir
        refds_path = Interface.get_refds_path(dataset)
        if not (refds_path or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")

        to_process = []
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='diff',
                # unavailable is OK, because we might query for a deleted file
                unavailable_path_status='',
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('type', None) == 'dataset':
                ap['process_content'] = True
            to_process.append(ap)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path,
                path_only=False)
        assert (not completed)

        for ds_path in sorted(content_by_ds.keys()):
            for r in _parse_git_diff(ds_path,
                                     diff_thingie=revision,
                                     paths=content_by_ds[ds_path],
                                     ignore_submodules=ignore_subdatasets,
                                     staged=staged):
                r.update(dict(action='diff', refds=refds_path), logger=lgr)
                if 'status' not in r:
                    r['status'] = 'ok'
                yield r
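All of these examples follow DataLad's generator-based result convention: `__call__` yields plain dicts that carry at least 'action', 'status', and 'path', plus 'refds' and a 'logger' where available. A minimal consumption sketch, assuming the `__call__` above is hosted by an Interface subclass called `Diff` here (the name is a stand-in):

    # Minimal sketch of consuming the diff generator above; `Diff` is a
    # stand-in name for the Interface subclass hosting this `__call__`.
    failed = []
    for res in Diff.__call__(path=['.'], revision='HEAD~1', recursive=True):
        if res.get('status') in ('error', 'impossible'):
            failed.append(res['path'])
        elif res.get('type') == 'file':
            print('{status}: {path}'.format(**res))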
Example #2
    def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        description=None,
        reckless=False,
        #git_opts=None,
        #annex_opts=None,
        #annex_get_opts=None,
        jobs='auto',
        verbose=False,
    ):
        # IMPLEMENTATION CONCEPT:
        #
        # 1. Sort the world into existing handles and the rest
        # 2. Try to locate missing handles (obtain subdatasets along the way)
        # 3. Expand into subdatasets with recursion enabled (potentially
        #    obtaining even more subdatasets)
        # 4. Shoot the info of which handles to get in each subdataset over
        #    to git-annex, once at the very end

        refds_path = Interface.get_refds_path(dataset)
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        # remember which results we already reported, to avoid duplicates
        yielded_ds = []
        to_get = []
        unavailable_paths = []
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='get',
                # NOTE: Do not act upon unavailable paths yet! Done below after
                # testing which ones could be obtained
                unavailable_path_status='',
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('state', None) == 'absent' and ap.get(
                    'raw_input', False):
                # if this wasn't found, but directly requested, queue for further
                # exploration
                unavailable_paths.append(ap)
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                # do not report what hasn't arrived yet
                # also do not report the base dataset that is already
                # present -- no surprise
                yield dict(ap,
                           status='notneeded',
                           logger=lgr,
                           message='already installed')
                yielded_ds.append(ap['path'])
                ap['process_content'] = get_data
            to_get.append(ap)

        # explore the unknown
        for ap in sorted(unavailable_paths, key=lambda x: x['path']):
            lgr.debug("Investigate yet unavailable path %s", ap)
            # how close can we get?
            dspath = ap.get('parentds', get_dataset_root(ap['path']))
            if dspath is None:
                # nothing we can do for this path
                continue
            lgr.debug("Found containing dataset %s for path %s", dspath,
                      ap['path'])
            ds = Dataset(dspath)
            # now actually obtain whatever is necessary to get to this path
            containing_ds = [dspath]
            for res in _install_necessary_subdatasets(ds,
                                                      ap['path'],
                                                      reckless,
                                                      refds_path,
                                                      description=description):
                # yield immediately so errors could be acted upon outside, before
                # we continue
                if not (res['type'] == 'dataset'
                        and res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record -- the recursion below might otherwise
                        # report on this dataset again
                        yielded_ds.append(res['path'])
                    yield res
                # update to the current innermost dataset
                containing_ds.append(res['path'])

            if len(containing_ds) < 2:
                # no subdataset was installed, hence if the path was unavailable
                # before it still is, no need to bother git annex
                ap.update(status='impossible', message='path does not exist')
                yield ap
                continue
            # important to only do the next for the innermost subdataset
            # as the `recursive` logic below relies on that!
            # set the correct parent, for a dataset this would be the second-last
            # reported subdataset
            ap.update(parentds=containing_ds[-1])
            if containing_ds[-1] == ap['path']:
                # the path actually refers to the last installed dataset
                ap.update(parentds=containing_ds[-2],
                          process_content=get_data,
                          type='dataset')
            to_get.append(ap)

        # results of recursive installation of yet undiscovered datasets
        rec_get = []
        if recursive and not recursion_limit == 'existing':
            # obtain any subdatasets underneath the paths given inside the
            # subdatasets that we know already exist
            # unless we do not want recursion into not-yet-installed datasets
            for ap in sorted(to_get, key=lambda x: x['path']):
                if ap['type'] not in ('dataset', 'directory') or not ap.get(
                        'raw_input', False):
                    # a non-directory cannot have content underneath
                    # also we only want to recurse into explicitly requested
                    # paths, to avoid duplication
                    continue
                subds = Dataset(ap['path'] if ap['type'] ==
                                'dataset' else ap['parentds'])
                lgr.info("Installing %s%s recursively", subds,
                         (" underneath %s" %
                          ap['path'] if subds.path != ap['path'] else ""))
                for res in _recursive_install_subds_underneath(
                        subds,
                        # `ap['path']` was explicitly given as input
                        # we count recursions from the input, hence we
                        # can start with the full number
                        recursion_limit,
                        reckless,
                        start=ap['path'],
                        refds_path=refds_path,
                        description=description):
                    # yield immediately so errors could be acted upon
                    # outside, before we continue
                    if not (res['type'] == 'dataset'
                            and res['path'] in yielded_ds):
                        # unless we reported on this dataset before
                        if res['type'] == 'dataset':
                            # make a record
                            yielded_ds.append(res['path'])
                        yield res
                    if not (res['status'] == 'ok'
                            and res['type'] == 'dataset'):
                        # not a dataset that was just installed, we just reported it
                        # upstairs, and can ignore it from now on
                        continue
                    # paranoia, so popular these days...
                    assert GitRepo.is_valid_repo(res['path'])
                    # keep a copy of the install record for `get` later on
                    get_ap = {
                        k: v
                        for k, v in res.items() if not k == 'status'
                    }
                    get_ap['process_content'] = get_data
                    rec_get.append(get_ap)

        if not get_data:
            # done already
            return

        # merge the two AP lists
        to_get.extend(rec_get)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_get,
                refds_path=refds_path)
        assert (not completed)

        # hand over to git-annex, get files content,
        # report files in git as 'notneeded' to get
        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            # grab content, ignore subdataset entries
            content = [
                ap['path'] for ap in content_by_ds[ds_path]
                if ap.get('type', None) != 'dataset' or ap['path'] == ds.path
            ]
            if not content:
                # cut this short should there be nothing
                continue
            # needs to be an annex to get content
            if not isinstance(ds.repo, AnnexRepo):
                for r in results_from_paths(
                        content,
                        status='notneeded',
                        message="no dataset annex, content already present",
                        action='get',
                        logger=lgr,
                        refds=refds_path):
                    yield r
                continue
            respath_by_status = {}
            for res in ds.repo.get(content,
                                   options=['--from=%s' %
                                            source] if source else [],
                                   jobs=jobs):
                res = annexjson2result(res,
                                       ds,
                                       type='file',
                                       logger=lgr,
                                       refds=refds_path)
                success = success_status_map[res['status']]
                # TODO: in case of some failed commands (e.g. get) there might
                # be no path in the record.  yoh has only vague idea of logic
                # here so just checks for having 'path', but according to
                # results_from_annex_noinfo, then it would be assumed that
                # `content` was acquired successfully, which is not the case
                if 'path' in res:
                    respath_by_status[success] = \
                        respath_by_status.get(success, []) + [res['path']]
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    content,
                    respath_by_status,
                    dir_fail_msg='could not get some content in %s %s',
                    noinfo_dir_msg='nothing to get from %s',
                    noinfo_file_msg='already present',
                    action='get',
                    logger=lgr,
                    refds=refds_path):
                yield r
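The `respath_by_status` bookkeeping is what later lets `results_from_annex_noinfo` distinguish paths git-annex reported on from paths it stayed silent about. A stripped-down sketch of that pattern, with a reduced stand-in for `success_status_map` (the real mapping lives in DataLad's result-handling utilities):

    # Reduced stand-in for success_status_map, limited to the statuses seen here.
    success_status_map = {'ok': 'success', 'notneeded': 'success',
                          'impossible': 'failure', 'error': 'failure'}

    def bucket_by_success(results):
        respath_by_status = {}
        for res in results:
            bucket = success_status_map[res['status']]
            # guard against records without a 'path' (see the TODO above)
            if 'path' in res:
                respath_by_status.setdefault(bucket, []).append(res['path'])
        return respath_by_status

    bucket_by_success([{'status': 'ok', 'path': 'a'},
                       {'status': 'error', 'path': 'b'}])
    # -> {'success': ['a'], 'failure': ['b']}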
Example #3
    def __call__(
            path=None,
            dataset=None,
            revision=None,
            staged=False,
            ignore_subdatasets='none',
            report_untracked='normal',
            recursive=False,
            recursion_limit=None):
        if not dataset and not path:
            # act on the whole dataset if nothing else was specified
            dataset = curdir
        refds_path = Interface.get_refds_path(dataset)

        to_process = []
        # track what commit ranges we want to diff per dataset
        ds_diffies = {}
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='diff',
                # unavailable is OK, because we might query for a deleted file
                unavailable_path_status='',
                nondataset_path_status='impossible',
                # must not use `modified`, infinite loop otherwise
                modified=None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('type', None) == 'dataset':
                ap['process_content'] = True
            if ap.get('raw_input', False) or ap['path'] == refds_path:
                # prepopulate the revision specs for all input paths
                ds_diffies[ap['path']
                           if ap.get('type', None) == 'dataset'
                           else ap['parentds']] = revision
            to_process.append(ap)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert(not completed)

        for ds_path in sorted(content_by_ds.keys()):
            if ds_path not in ds_diffies:
                # we don't know how to diff this dataset:
                # it was neither an input path, nor did we see it
                # when diffing its parent
                continue
            content_paths = content_by_ds[ds_path]
            revision = ds_diffies[ds_path]
            for r in _parse_git_diff(
                    ds_path,
                    diff_thingie=ds_diffies[ds_path],
                    paths=content_paths,
                    ignore_submodules=ignore_subdatasets,
                    staged=staged):
                r.update(dict(
                    action='diff',
                    logger=lgr))
                if refds_path:
                    r['refds'] = refds_path
                if 'status' not in r:
                    r['status'] = 'ok'
                if r.get('type', None) == 'dataset':
                    # this is a subdataset report
                    # we need to use the reported commit range to properly adjust the
                    # query once we hit that subdataset
                    from_rev = r.get('revision_src', '')
                    to_rev = r.get('revision', '')
                    subrev = '{}..{}'.format(
                        from_rev if from_rev else PRE_INIT_COMMIT_SHA,
                        to_rev if to_rev else '',
                    )
                    if from_rev and from_rev == to_rev:
                        # this is a special case, where subdataset reported changes without
                        # a change in state/commit -- this is code for uncommitted changes
                        # in the subdataset (including staged ones). In such a case, we
                        # must not provide a diff range, but only the source commit we want
                        # to diff against
                        # XXX if this is changed, likely the same logic in annotate_paths needs
                        # changing too!
                        subrev = from_rev
                    ds_diffies[r['path']] = subrev
                yield r
            if (revision and '..' in revision) or report_untracked == 'no':
                # don't look for untracked content, we got a revision range
                continue
            for r in _get_untracked_content(
                    ds_path,
                    report_untracked,
                    paths=content_paths):
                r.update(dict(
                    action='diff',
                    logger=lgr))
                if refds_path:
                    r['refds'] = refds_path
                if 'status' not in r:
                    r['status'] = 'ok'
                yield r
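The subdataset handling above boils down to a single range-construction rule: fall back to a pre-init commit when the previous state is unknown, and collapse to a single commit when the subdataset changed without a new commit (i.e. uncommitted changes). A self-contained sketch; the SHA below is git's well-known empty-tree object, which is assumed here to be what `PRE_INIT_COMMIT_SHA` refers to:

    # Self-contained sketch of the revision-range rule above. The constant is
    # git's empty-tree object SHA (assumed stand-in for PRE_INIT_COMMIT_SHA).
    PRE_INIT_COMMIT_SHA = '4b825dc642cb6eb9a060e54bf8d69288fbee4904'

    def subds_revrange(from_rev, to_rev):
        if from_rev and from_rev == to_rev:
            # uncommitted changes: diff against the single source commit
            return from_rev
        return '{}..{}'.format(from_rev if from_rev else PRE_INIT_COMMIT_SHA,
                               to_rev if to_rev else '')

    subds_revrange('', 'abc123')        # '4b825d...04..abc123' (new subdataset)
    subds_revrange('abc123', 'abc123')  # 'abc123' (uncommitted changes)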
Example #4
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 check=True,
                 if_dirty='save-before'):

        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `drop`: requires at least a path or dataset"
            )
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='drop', logger=lgr, refds=refds_path)
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path
        to_drop = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='drop',
                # justification for status:
                # content need not be dropped where there is none
                unavailable_path_status='notneeded',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                ap['process_content'] = True
            if ap.get('registered_subds', False) and ap.get('state',
                                                            None) == 'absent':
                # nothing to drop in an absent subdataset, don't be annoying
                # and skip silently
                continue
            to_drop.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_drop,
                refds_path=refds_path)
        assert (not completed)

        # iterate over all datasets, order doesn't matter
        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            # ignore submodule entries
            content = [
                ap['path'] for ap in content_by_ds[ds_path]
                if ap.get('type', None) != 'dataset' or ap['path'] == ds.path
            ]
            if not content:
                continue
            for r in _drop_files(ds, content, check=check, **res_kwargs):
                yield r
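Note how `res_kwargs` fixes the fields shared by every result record once, and `_drop_files` spreads them into each record via `**res_kwargs`. A tiny sketch of that convention (`make_record` is a hypothetical helper, not DataLad code):

    # Hypothetical helper illustrating the shared-kwargs convention above.
    res_kwargs = dict(action='drop', logger=None, refds='/data/ds')

    def make_record(path, status, **common):
        return dict(common, path=path, status=status)

    make_record('/data/ds/big.dat', 'ok', **res_kwargs)
    # -> {'action': 'drop', 'logger': None, 'refds': '/data/ds',
    #     'path': '/data/ds/big.dat', 'status': 'ok'}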
Example #5
    def __call__(message=None,
                 files=None,
                 dataset=None,
                 all_updated=True,
                 all_changes=None,
                 version_tag=None,
                 recursive=False,
                 recursion_limit=None,
                 super_datasets=False):
        if all_changes is not None:
            from datalad.support.exceptions import DeprecatedError
            raise DeprecatedError(
                new="all_updated option where fits and/or datalad add",
                version="0.5.0",
                msg="RF: all_changes option passed to the save")
        if not dataset and not files:
            # we got nothing at all -> save what is staged in the repo in "this" directory?
            # we verify that there is an actual repo next
            dataset = abspath(curdir)
        refds_path = Interface.get_refds_path(dataset)

        to_process = []
        for ap in AnnotatePaths.__call__(
                path=files,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='save',
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            # next check should not be done during annotation, as it is possibly expensive
            # and not generally useful
            if ap.get('status', None) == 'impossible' and \
                    ap.get('state', None) == 'absent' and \
                    ap.get('parentds', None):
                # this is not here anymore, but it might actually have been a deleted
                # component
                if relpath(ap['path'], start=ap['parentds']) \
                        in Dataset(ap['parentds']).repo.get_deleted_files():
                    # ok, this is a staged deletion that we want to save
                    ap['status'] = ''
                    del ap['message']
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            # for things like: `ds.save()`
            # or recursively discovered datasets
            if ap['path'] == refds_path or \
                    (ap.get('type', None) == 'dataset' and
                     not ap.get('raw_input', False) and
                     not ap.get('state', None) == 'absent'):
                ap['process_content'] = True
                ap['process_updated_only'] = all_updated
            to_process.append(ap)

        if not to_process:
            # nothing left to do, potentially all errored before
            return

        if super_datasets:
            # search for the topmost superdatasets of any path
            dss = [
                Dataset(ap.get('parentds', ap['path'])) for ap in to_process
            ]
            superdss = [ds.get_superdataset(topmost=True) for ds in dss]
            superdss = get_tree_roots(
                unique(ds.path for ds in dss + superdss if ds))
            if dataset:
                # need to adjust the reference to the new superds
                # if we had one ref before, we should still have exactly one
                assert len(superdss) <= 1
                dataset = list(superdss.keys())[0]
                refds_path = dataset
        elif refds_path:
            # there is a single superdataset
            superdss = {
                refds_path:
                unique(
                    [ap['parentds'] for ap in to_process if 'parentds' in ap])
            }
        else:
            # sort all datasets under their potential superdatasets
            # start from the top to get all subdatasets down the line
            # and collate them into as few superdatasets as possible
            # this is quick, just string operations
            superdss = get_tree_roots(
                unique(
                    [ap['parentds'] for ap in to_process if 'parentds' in ap]))
        # for each "superdataset" check the tree of subdatasets and make sure
        # we gather all datasets between the super and any subdataset
        # so we can save them all bottom-up in order to be able to properly
        # save the superdataset
        # if this is called from e.g. `add` this is actually not necessary,
        # but in the general case we cannot avoid it
        # TODO maybe introduce a switch?
        discovered = {}
        for superds_path in superdss:
            target_subs = superdss[superds_path]
            discover_dataset_trace_to_targets(
                # from here
                superds_path,
                # to all
                target_subs,
                [],
                discovered)
        # create a new minimally annotated path for each discovered dataset
        discovered_added = set()
        for parentds in discovered:
            for subds in discovered[parentds]:
                to_process.append(
                    dict(path=subds, parentds=parentds, type='dataset'))
                discovered_added.add(subds)
        # make sure we have an entry for each dataset, including those
        # that are just parents
        for parentds in discovered:
            if parentds not in discovered_added:
                to_process.append(
                    dict(
                        path=parentds,
                        type='dataset',
                        # make sure we save content of superds later on
                        process_content=True))

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, deduplication happens here too
        annotated_paths = AnnotatePaths.__call__(
            path=to_process,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='save',
            unavailable_path_status='',
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        # now sort into datasets so we can process them one by one
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path,
                path_only=False)
        assert (not completed)

        # iterate over all datasets, starting at the bottom
        for dspath in sorted(content_by_ds.keys(), reverse=True):
            ds = Dataset(dspath)
            res = get_status_dict('save', ds=ds, logger=lgr)
            if not ds.is_installed():
                # TODO This is likely impossible now
                res['status'] = 'impossible'
                res['message'] = ('dataset %s is not installed', ds)
                yield res
                continue
            saved_state = save_dataset(ds,
                                       content_by_ds[dspath],
                                       message=message,
                                       version_tag=version_tag)
            if saved_state:
                res['status'] = 'ok'
            else:
                res['status'] = 'notneeded'
            yield res
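The superdataset collation above hinges on a tree-roots helper: given a set of dataset paths, keep only the topmost ones and map each to the paths underneath it. An illustrative stand-in for that behavior (not DataLad's `get_tree_roots` implementation):

    # Illustrative stand-in for the get_tree_roots behavior relied on above;
    # sorting guarantees a root is seen before anything underneath it.
    def tree_roots(paths):
        roots = {}
        for p in sorted(paths):
            root = next((r for r in roots
                         if p == r or p.startswith(r.rstrip('/') + '/')), None)
            if root is None:
                roots[p] = []
            else:
                roots[root].append(p)
        return roots

    tree_roots(['/super', '/super/sub', '/super/sub/deep', '/other'])
    # -> {'/other': [], '/super': ['/super/sub', '/super/sub/deep']}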
Example #6
    def __call__(path=None,
                 dataset=None,
                 revision=None,
                 staged=False,
                 ignore_subdatasets='none',
                 report_untracked='normal',
                 recursive=False,
                 recursion_limit=None):
        if not dataset and not path:
            # act on the whole dataset if nothing else was specified
            dataset = curdir
        refds_path = Interface.get_refds_path(dataset)

        to_process = []
        # track what commit ranges we want to diff per dataset
        ds_diffies = {}
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='diff',
                # unavailable is OK, because we might query for a deleted file
                unavailable_path_status='',
                nondataset_path_status='impossible',
                # must not use `modified`, infinite loop otherwise
                modified=None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('type', None) == 'dataset':
                ap['process_content'] = True
            if ap.get('raw_input', False) or ap['path'] == refds_path:
                # prepopulate the revision specs for all input paths
                ds_diffies[ap['path'] if ap.get('type', None) ==
                           'dataset' else ap['parentds']] = revision
            to_process.append(ap)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert (not completed)

        for ds_path in sorted(content_by_ds.keys()):
            if ds_path not in ds_diffies:
                # we don't know how to diff this dataset:
                # it was neither an input path, nor did we see it
                # when diffing its parent
                continue
            content_paths = content_by_ds[ds_path]
            revision = ds_diffies[ds_path]
            for r in _parse_git_diff(ds_path,
                                     diff_thingie=ds_diffies[ds_path],
                                     paths=content_paths,
                                     ignore_submodules=ignore_subdatasets,
                                     staged=staged):
                r.update(dict(action='diff', logger=lgr))
                if refds_path:
                    r['refds'] = refds_path
                if 'status' not in r:
                    r['status'] = 'ok'
                if r.get('type', None) == 'dataset':
                    # this is a subdataset report
                    # we need to use the reported commit range to properly adjust the
                    # query once we hit that subdataset
                    from_rev = r.get('revision_src', '')
                    to_rev = r.get('revision', '')
                    subrev = '{}..{}'.format(
                        from_rev if from_rev else PRE_INIT_COMMIT_SHA,
                        to_rev if to_rev else '',
                    )
                    if from_rev and from_rev == to_rev:
                        # this is a special case, where subdataset reported changes without
                        # a change in state/commit -- this is code for uncommitted changes
                        # in the subdataset (including staged ones). In such a case, we
                        # must not provide a diff range, but only the source commit we want
                        # to diff against
                        # XXX if this is changed, likely the same logic in annotate_paths needs
                        # changing too!
                        subrev = from_rev
                    ds_diffies[r['path']] = subrev
                yield r
            if (revision and '..' in revision) or report_untracked == 'no':
                # don't look for untracked content, we got a revision range
                continue
            for r in _get_untracked_content(ds_path,
                                            report_untracked,
                                            paths=content_paths):
                r.update(dict(action='diff', logger=lgr))
                if refds_path:
                    r['refds'] = refds_path
                if 'status' not in r:
                    r['status'] = 'ok'
                yield r
Example #7
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            check=True,
            if_dirty='save-before'):

        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `drop`: requires at least a path or dataset")
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='drop', logger=lgr, refds=refds_path)
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path
        to_drop = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='drop',
                # justification for status:
                # content need not be dropped where there is none
                unavailable_path_status='notneeded',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                ap['process_content'] = True
            if ap.get('registered_subds', False) and ap.get('state', None) == 'absent':
                # nothing to drop in an absent subdataset, don't be annoying
                # and skip silently
                continue
            to_drop.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_drop,
                refds_path=refds_path)
        assert(not completed)

        # iterate over all datasets, order doesn't matter
        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            # TODO generator
            # this should yield what it did
            handle_dirty_dataset(ds, mode=if_dirty)
            # ignore submodule entries
            content = [ap['path'] for ap in content_by_ds[ds_path]
                       if ap.get('type', None) != 'dataset' or ap['path'] == ds.path]
            if not content:
                continue
            for r in _drop_files(ds, content, check=check, **res_kwargs):
                yield r
Example #8
    def __call__(
            path=None,
            dataset=None,
            # support passing this through on a path-by-path basis
            to_git=None,
            save=True,
            message=None,
            recursive=False,
            recursion_limit=None,
            ds2super=False,
            git_opts=None,
            annex_opts=None,
            annex_add_opts=None,
            jobs=None):
        # parameter constraints:
        if not path:
            raise InsufficientArgumentsError(
                "insufficient information for adding: requires at least a path"
            )
        refds_path = Interface.get_refds_path(dataset)
        common_report = dict(action='add', logger=lgr, refds=refds_path)

        to_add = []
        subds_to_add = {}
        ds_to_annotate_from_recursion = {}
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=dataset,
                # never recursion, need to handle manually below to be able to
                # discover untracked content
                recursive=False,
                action='add',
                # speed things up by using Git's modification detection, if there
                # is a repo with at least one commit
                modified='HEAD' \
                if dataset and \
                GitRepo.is_valid_repo(refds_path) and \
                GitRepo(refds_path).get_hexsha() \
                else None,
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            got_nothing = False
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('parentds', None) is None and ap.get('type',
                                                           None) != 'dataset':
                yield get_status_dict(
                    status='impossible',
                    message='"there is no dataset to add this path to',
                    **dict(common_report, **ap))
                continue
            if ap.get('type', None) == 'directory' and \
                    ap.get('state', None) == 'untracked' and \
                    GitRepo.is_valid_repo(ap['path']):
                # this is an untracked wannabe subdataset in disguise
                ap['type'] = 'dataset'
            if recursive and \
                    (ap.get('raw_input', False) or
                     ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                    (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
                # this was an actually requested input path, or a path that was found
                # modified by path annotation, based on an input argument
                # we need to recurse into all subdirs to find potentially
                # unregistered subdatasets
                # but only if this path has a parent, or is itself a dataset
                # otherwise there is nothing to add to
                _discover_subdatasets_recursively(
                    ds_to_annotate_from_recursion, ap['path'],
                    [ap['parentds'] if 'parentds' in ap else ap['path']],
                    recursion_limit)
                # get the file content of the root dataset of this search added too
                # but be careful with extreme recursion_limit settings
                if recursion_limit is None or recursion_limit > 0:
                    ap['process_content'] = True
            # record for further processing
            if ap['path'] not in ds_to_annotate_from_recursion:
                # unless it was somehow already discovered
                to_add.append(ap)
            # TODO check if next isn't covered by discover_dataset_trace_to_targets already??
            if dataset and ap.get('type', None) == 'dataset':
                # duplicates not possible, annotated_paths returns unique paths
                subds_to_add[ap['path']] = ap
        if got_nothing:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict('add',
                                  status='notneeded',
                                  path=refds_path,
                                  type='dataset',
                                  logger=lgr)
            return

        for subds in ds_to_annotate_from_recursion:
            if subds not in subds_to_add:
                # always prefer the already annotated path
                subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

        if dataset:
            # we have a base dataset, discover any intermediate datasets between
            # the base and any already discovered dataset
            discovered = {}
            discover_dataset_trace_to_targets(
                # from here
                dataset.path,
                # to any dataset we are aware of
                subds_to_add.keys(),
                [],
                discovered)
            for parentds in discovered:
                for subds in discovered[parentds]:
                    subds_to_add[subds] = subds_to_add.get(
                        subds,
                        dict(path=subds, parentds=parentds, type='dataset'))

        # merge custom paths and discovered dataset records; paths need to go
        # first, because we know most about them, and in the subsequent
        # annotation call we skip the later duplicates
        to_add.extend(subds_to_add.values())
        # and compact, this should be OK as all the info is in each ap dict
        to_add = unique(to_add, lambda x: x['path'])

        if not to_add:
            # nothing left to do, potentially all errored before
            return

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, it will also deduplicate
        annotated_paths = AnnotatePaths.__call__(
            path=to_add,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='add',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert (not completed)

        if not content_by_ds:
            # we should have complained about any inappropriate path argument
            # above, so if nothing is left, we can simply exit
            return

        # simple loop over datasets -- save happens later
        # start deep down
        to_save = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            torepoadd = {}
            respath_by_status = {}
            for ap in content_by_ds[ds_path]:
                # we have a new story
                ap.pop('status', None)
                torepoadd[ap['path']] = ap

                # skip anything that doesn't look like a wannabe subdataset
                if not ap.get('type', None) == 'dataset' or \
                        ap['path'] == ds_path:
                    continue

                if ap.get('registered_subds', False):
                    # subdataset that might be in this list because of the
                    # need to save all the way up to a super dataset
                    respath_by_status['success'] = \
                        respath_by_status.get('success', []) + [ap['path']]
                    yield get_status_dict(status='notneeded',
                                          message="already known subdataset",
                                          **dict(common_report, **ap))
                    continue
                subds = Dataset(ap['path'])
                # check that the subds has a commit, and refuse
                # to operate on it otherwise, or we would get a bastard
                # submodule that cripples git operations
                if not subds.repo.get_hexsha():
                    yield get_status_dict(
                        ds=subds,
                        status='impossible',
                        message='cannot add subdataset with no commits',
                        **dict(common_report, **ap))
                    continue
                subds_relpath = relpath(ap['path'], ds_path)
                # make an attempt to configure a submodule source URL based on the
                # discovered remote configuration
                remote, branch = subds.repo.get_tracking_branch()
                subds_url = subds.repo.get_remote_url(
                    remote) if remote else None
                # Register the repository in the repo tree as a submodule
                try:
                    ds.repo.add_submodule(subds_relpath,
                                          url=subds_url,
                                          name=None)
                except CommandError as e:
                    yield get_status_dict(ds=subds,
                                          status='error',
                                          message=e.stderr,
                                          **dict(common_report, **ap))
                    continue
                # queue for saving using the updated annotated path
                ap['registered_subds'] = True
                # I hope this is true in direct mode too
                # TODO this is disabled, because in some circumstances
                # staging just doesn't happen, and it is unclear when
                # exactly -- the case that prompted disabling was a submodule
                # that had no content except for other submodules and was not
                # staged, whereas another submodule on the same level in the
                # same superdataset, which also had one file in it, was staged;
                # keep this disabled to work correctly, at the cost of a little
                # slow-down
                #ap['staged'] = True
                to_save.append(ap)
                _fixup_submodule_dotgit_setup(ds, subds_relpath)
                # report added subdatasets -- `annex add` below won't do it
                yield get_status_dict(ds=subds,
                                      status='ok',
                                      message='added new subdataset',
                                      **dict(common_report, **ap))
                # make sure that .gitmodules is added to the list of files
                gitmodules_path = opj(ds.path, '.gitmodules')
                # for git
                torepoadd[gitmodules_path] = dict(path=gitmodules_path)
                # and for save
                to_save.append(
                    dict(path=gitmodules_path, parentds=ds_path, type='file'))
            # make sure any last minute additions make it to the saving stage
            # XXX? should content_by_ds become OrderedDict so that possible
            # super here gets processed last?
            lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
            is_annex = isinstance(ds.repo, AnnexRepo)
            add_kw = {'jobs': jobs} if is_annex and jobs else {}
            added = ds.repo.add(list(torepoadd.keys()),
                                git=to_git if is_annex else True,
                                **add_kw)
            for a in added:
                res = annexjson2result(a, ds, type='file', **common_report)
                success = success_status_map[res['status']]
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
                # produce best possible path/result annotation
                if res['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    res = dict(torepoadd[res['path']], **res)
                # override this in all cases to be safe
                res['parentds'] = ds.path
                if success:
                    # this was successfully added, queue for saving this very path
                    # in the dataset
                    ap = {k: v for k, v in res.items() if k != 'status'}
                    ap['staged'] = True
                    # strip any status and state info (e.g. save will refuse to save
                    # stuff that is marked state='untracked')
                    to_save.append({
                        k: v
                        for k, v in res.items() if k not in ('status', 'state')
                    })
                if a['file'] == '.gitmodules':
                    # filter out .gitmodules, because this is only included for
                    # technical reasons and has nothing to do with the actual content
                    continue
                if GitRepo.is_valid_repo(res['path']):
                    # more accurate report in case of an added submodule
                    # mountpoint.
                    # XXX Actually not sure if this can really happen
                    # (depends on what our low-level code would do)
                    # but worst case is that we lose a little bit of
                    # coverage...
                    res['type'] = 'dataset'
                    res['message'] = 'added new state as submodule'
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    torepoadd,
                    respath_by_status,
                    dir_fail_msg='could not add some content in %s %s',
                    noinfo_dir_msg='nothing to add from %s',
                    noinfo_file_msg='already included in the dataset',
                    action='add',
                    logger=lgr,
                    refds=refds_path):
                if r['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    r = dict(r, **torepoadd[r['path']])

                if r['status'] == 'notneeded':
                    # this could be a file that was staged already; it doesn't
                    # need to be added, but it should be saved/committed if so desired
                    to_save.append({
                        k: v
                        for k, v in r.items() if k not in ('status', 'state')
                    })

                # XXX something is fishy with the next one, rethink when sober....
                if r['path'] == ds_path and r['status'] == 'ok':
                    # this is for the entire dataset itself which was explicitly requested
                    # make sure to save all
                    r['type'] = 'dataset'
                    r['process_content'] = True
                    to_save.append(
                        {k: v
                         for k, v in r.items() if k != 'status'})
                yield r
            if refds_path and ds_path != refds_path and len(
                    respath_by_status.get('success', [])):
                # TODO XXX we have an issue here when `add('.')` is used and annex ignores any
                # dotfiles. In this case we end up not saving a dataset completely, because
                # we rely on accurate reporting. there is an issue about this already
                # TODO look up the issue ID
                # if there is a base dataset, but we are below it, and we have anything done to this
                # dataset -> queue dataset itself for saving its state in the parent
                ds_ap = dict(
                    path=ds.path,
                    # we have to look for the parent here, as we must save the
                    # subdataset in the parent and not the whole subdataset itself
                    type='dataset')
                parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
                if parentds:
                    ds_ap['parentds'] = parentds
                if dataset:
                    ds_ap['refds'] = refds_path
                to_save.append(ds_ap)

        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        # TODO tell save what was staged already! Set 'staged=True' for
        # respective annotated paths that are fed into `save`

        # do not reuse any of the sorting done in here for saving, but instead
        # pass on all the annotated paths to have `save` figure out what to do with
        # them -- this costs something, but should be safer, and frankly is
        # more comprehensible
        for res in Save.__call__(
                # hand-selected annotated paths
                path=to_save,
                dataset=refds_path,
                message=message if message else '[DATALAD] added content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
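A recurring move in this example is enriching the terse git-annex report with the richer annotated path gathered earlier: `dict(torepoadd[res['path']], **res)` keeps the annotation's extra keys while letting the fresh annex record win on shared ones (in the noinfo loop the direction flips, so there the stored annotation wins). A minimal sketch of the first form:

    # Minimal sketch of the annotation merge above: annotation first, fresh
    # annex record second, so the fresh record wins on shared keys.
    annotation = {'path': '/ds/f.dat', 'type': 'file', 'parentds': '/ds',
                  'raw_input': True}
    annex_report = {'path': '/ds/f.dat', 'status': 'ok'}
    merged = dict(annotation, **annex_report)
    # -> 'parentds' and 'raw_input' survive; 'status' comes from the report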
Example #9
    def __call__(message=None,
                 path=None,
                 dataset=None,
                 all_updated=True,
                 version_tag=None,
                 recursive=False,
                 recursion_limit=None,
                 super_datasets=False,
                 message_file=None):
        if not dataset and not path:
            # we got nothing at all -> save what is staged in the repo in "this" directory?
            # make sure we don't treat this as a user-provided '.' argument
            path = [{'path': abspath(curdir), 'raw_input': False}]

        refds_path = Interface.get_refds_path(dataset)

        if message and message_file:
            yield get_status_dict(
                'save',
                status='error',
                path=refds_path,
                message="Both a message and message file were specified",
                logger=lgr)
            return

        if message_file:
            with open(message_file) as mfh:
                message = mfh.read()

        to_process = []
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='save',
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                modified='HEAD' if not path and recursive else None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('state',
                      None) == 'untracked' and not ap.get('raw_input', False):
                # this path was found untracked, but not explicitly given to save
                # we will silently ignore this
                continue
            got_nothing = False
            # next check should not be done during annotation, as it is possibly expensive
            # and not generally useful
            if ap.get('status', None) == 'impossible' and \
                    ap.get('state', None) == 'absent' and \
                    ap.get('parentds', None):
                # this is not here anymore, but it might actually have been a deleted
                # component
                if relpath(ap['path'], start=ap['parentds']) \
                        in Dataset(ap['parentds']).repo.get_deleted_files():
                    # ok, this is a staged deletion that we want to save
                    ap['status'] = ''
                    del ap['message']
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            # for things like: `ds.save()`
            # or recursively discovered datasets
            if ap['path'] == refds_path or \
                    (ap.get('type', None) == 'dataset' and
                     not ap.get('raw_input', False) and
                     not ap.get('state', None) == 'absent'):
                ap['process_content'] = True
                ap['process_updated_only'] = all_updated
            to_process.append(ap)
        lgr.log(2, "save, to_process=%r", to_process)
        if got_nothing and recursive and refds_path:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict('save',
                                  status='notneeded',
                                  path=refds_path,
                                  type='dataset',
                                  logger=lgr)
            return

        if not to_process:
            # nothing left to do, potentially all errored before
            return

        if super_datasets:
            # search for the topmost superdatasets of any path
            dss = [
                Dataset(ap.get('parentds', ap['path'])) for ap in to_process
            ]
            superdss = [ds.get_superdataset(topmost=True) for ds in dss]
            superdss = get_tree_roots(
                unique(ds.path for ds in dss + superdss if ds))
            if dataset:
                # need to adjust the reference to the new superds
                # if we had one ref before, we should still have exactly one
                assert len(superdss) <= 1
                dataset = list(superdss.keys())[0]
                refds_path = dataset
        elif refds_path:
            # there is a single superdataset
            superdss = {
                refds_path:
                unique(
                    [ap['parentds'] for ap in to_process if 'parentds' in ap])
            }
        else:
            # sort all datasets under their potential superdatasets
            # start from the top to get all subdatasets down the line
            # and collate them into as few superdatasets as possible
            # this is quick, just string operations
            superdss = get_tree_roots(
                unique(
                    [ap['parentds'] for ap in to_process if 'parentds' in ap]))
        # for each "superdataset" check the tree of subdatasets and make sure
        # we gather all datasets between the super and any subdataset
        # so we can save them all bottom-up in order to be able to properly
        # save the superdataset
        # if this is called from e.g. `add` this is actually not necessary,
        # but in the general case we cannot avoid it
        # TODO maybe introduce a switch?
        discovered = {}
        for superds_path in superdss:
            target_subs = superdss[superds_path]
            discover_dataset_trace_to_targets(
                # from here
                superds_path,
                # to all
                target_subs,
                [],
                discovered)
        # create a new minimally annotated path for each discovered dataset
        discovered_added = set()
        for parentds in discovered:
            for subds in discovered[parentds]:
                to_process.append(
                    dict(path=subds, parentds=parentds, type='dataset'))
                discovered_added.add(subds)
        # make sure we have an entry for each dataset, including those
        # that are just parents
        for parentds in discovered:
            if parentds not in discovered_added:
                to_process.append(
                    dict(
                        path=parentds,
                        type='dataset',
                        # make sure we save content of superds later on
                        process_content=True,
                        # but not do nasty things, like adding untracked content
                        # just because we discovered this dataset
                        process_updated_only=True))

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, deduplication happens here too
        annotated_paths = AnnotatePaths.__call__(
            path=to_process,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='save',
            unavailable_path_status='',
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        # now sort into datasets so we can process them one by one
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert (not completed)

        # iterate over all datasets, starting at the bottom
        for dspath in sorted(content_by_ds.keys(), reverse=True):
            ds = Dataset(dspath)
            res = get_status_dict('save', ds=ds, logger=lgr)
            if not ds.is_installed():
                # TODO This is likely impossible now
                res['status'] = 'impossible'
                res['message'] = ('dataset %s is not installed', ds)
                yield res
                continue
            saved_state = save_dataset(ds,
                                       content_by_ds[dspath],
                                       message=message)
            res['status'] = 'ok' if saved_state else 'notneeded'
            # MIH: let's tag even if there was nothing to commit. I'd forget this
            # option too often...
            if version_tag:
                try:
                    # TODO: check whether comment below is still true after
                    # removing the log swallowing:
                    # again cannot help but force-silence low-level code, because
                    # it screams like a made man instead of allowing top-level
                    # code an orderly error report
                    ds.repo.tag(version_tag)
                    # even if we haven't saved anything
                    res['status'] = 'ok'
                    yield res
                except CommandError as e:
                    if saved_state:
                        # first we yield the result for the actual save
                        yield res
                    # and now complain that tagging didn't work
                    yield get_status_dict(
                        'save',
                        ds=ds,
                        logger=lgr,
                        status='error',
                        message=('cannot tag this version: %s',
                                 e.stderr.strip()))
            else:
                yield res
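
A minimal usage sketch for the `save` implementation above (hedged: it assumes this command is exposed as `datalad.api.save` with the parameter names from the signature, and that `/tmp/myds` is an existing dataset; the path and argument values are hypothetical):

import datalad.api as dl

# save all pending modifications in the dataset and its subdatasets,
# and tag the resulting state; results are status dicts as yielded above
for res in dl.save(
        dataset='/tmp/myds',          # hypothetical dataset location
        message='record current state',
        recursive=True,
        version_tag='v0.1',           # exercises the tagging branch above
        return_type='generator',
        on_failure='ignore'):
    print(res['status'], res.get('path'))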
Example #10
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            check=True,
            save=True,
            message=None,
            if_dirty='save-before'):
        res_kwargs = dict(action='remove', logger=lgr)
        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `remove`: requires at least a path or dataset")
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs['refds'] = refds_path
        if refds_path and not path and not GitRepo.is_valid_repo(refds_path):
            # nothing here, nothing to remove
            yield get_status_dict(path=refds_path, status='notneeded', **res_kwargs)
            return
        if refds_path and not path:
            # act on the whole dataset if nothing else was specified
            # TODO i think that would happen automatically in annotation?
            path = refds_path

        to_process = []

        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                # we only ever want to discover immediate subdatasets, the rest
                # will happen in `uninstall`
                recursion_limit=1,
                action='remove',
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('state', None) == 'absent' and \
                    ap.get('parentds', None) is None:
                # nothing exists at location, and there is no parent to
                # remove from
                ap['status'] = 'notneeded'
                ap['message'] = "path does not exist and is not in a dataset"
                yield ap
                continue
            if ap.get('raw_input', False) and ap.get('type', None) == 'dataset':
                # make sure dataset sorting yields a dedicated entry for this one
                ap['process_content'] = True
            to_process.append(ap)

        if not to_process:
            # nothing left to do, potentially all errored before
            return

        if path_is_under([ap['path'] for ap in to_process]):
            # behave like `rm` and refuse to remove where we are
            raise ValueError(
                "refusing to uninstall current or parent directory")

        # now sort into datasets so we can process them one by one
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert(not completed)

        # iterate over all datasets, starting at the bottom
        # to make the removal of dataset content known upstairs
        to_save = []
        # track which submodules we have removed in the process, to avoid
        # failure in case we revisit them due to a subsequent path argument
        subm_removed = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            paths = content_by_ds[ds_path]
            to_reporemove = dict()
            # PLAN any dataset that was not raw_input, uninstall (passing recursive flag)
            # if dataset itself is in paths, skip any nondataset
            # sort reverse so we get subdatasets first
            for ap in sorted(paths, key=lambda x: x['path'], reverse=True):
                if ap.get('type', None) == 'dataset':
                    # entire dataset needs to go, uninstall if present, pass recursive!
                    uninstall_failed = False
                    if ap['path'] == refds_path or \
                            (refds_path is None and ap.get('raw_input', False)):
                        # top-level handling, cannot use regular uninstall call, as
                        # it will refuse to uninstall a top-level dataset
                        # and rightfully so, it is really a remove in that case
                        # bypass all the safety by using low-level helper
                        for r in _uninstall_dataset(ds, check=check, has_super=False,
                                                    **res_kwargs):
                            if r['status'] in ('impossible', 'error'):
                                # we need to inspect if something went wrong, in order
                                # to prevent failure from removing a non-empty dir below,
                                # but at the same time allow for continued processing
                                uninstall_failed = True
                            r['refds'] = refds_path
                            yield r
                    # recheck that it wasn't removed during a previous iteration
                    elif ap.get('state', None) != 'absent' and GitRepo.is_valid_repo(ap['path']):
                        # anything that is not the top-level -> regular uninstall
                        # this is for subdatasets of the to-be-removed dataset
                        # we want to simply uninstall them in a regular manner
                        for r in Uninstall.__call__(
                                ap['path'],
                                dataset=refds_path, recursive=recursive, check=check,
                                if_dirty=if_dirty, result_xfm=None, result_filter=None,
                                on_failure='ignore'):
                            if r['status'] in ('impossible', 'error'):
                                # we need to inspect if something went wrong, in order
                                # to prevent failure from removing a non-empty dir below,
                                # but at the same time allow for continued processing
                                uninstall_failed = True
                            yield r
                    if not ap.get('raw_input', False):
                        # we only ever want to actually unregister subdatasets that
                        # were given explicitly
                        continue
                    if not uninstall_failed and \
                            not ap['path'] in subm_removed and \
                            refds_path and \
                            ap.get('parentds', None) and \
                            not (relpath(ap['path'], start=refds_path).startswith(pardir) or
                                 ap['path'] == refds_path) and \
                            ap.get('registered_subds', False):
                        # strip from superdataset, but only if a dataset was given explicitly
                        # as in "remove from this dataset", but not when just a path was given
                        # as in "remove from the filesystem"
                        subds_relpath = relpath(ap['path'], start=ap['parentds'])
                        # remove submodule reference
                        parentds = Dataset(ap['parentds'])
                        # play safe, will fail on dirty
                        parentds.repo.deinit_submodule(ap['path'])
                        # remove now empty submodule link
                        parentds.repo.remove(ap['path'])
                        # make a record that we removed this already, should it be
                        # revisited via another path argument, because we do not reannotate
                        # the paths after every removal
                        subm_removed.append(ap['path'])
                        yield dict(ap, status='ok', **res_kwargs)
                        # need .gitmodules update in parent
                        to_save.append(dict(
                            path=opj(parentds.path, '.gitmodules'),
                            parents=parentds.path,
                            type='file'))
                        # and the removal itself needs to be committed
                        # inform `save` that it is OK that this path
                        # doesn't exist on the filesystem anymore
                        ap['unavailable_path_status'] = ''
                        ap['process_content'] = False
                        to_save.append(ap)
                    if not uninstall_failed and exists(ap['path']):
                        # could be an empty dir in case an already uninstalled subdataset
                        # got removed
                        rmdir(ap['path'])
                else:
                    # anything that is not a dataset can simply be passed on
                    to_reporemove[ap['path']] = ap
            # avoid unnecessary git calls when there is nothing to do
            if to_reporemove:
                if check and hasattr(ds.repo, 'drop'):
                    for r in _drop_files(ds, list(to_reporemove),
                                         check=True):
                        if r['status'] == 'error':
                            # if drop errored on that path, we can't remove it
                            to_reporemove.pop(r['path'], None)
                        yield r

                if to_reporemove:
                    for r in ds.repo.remove(list(to_reporemove), r=True):
                        # these were removed, but we still need to save the
                        # removal

                        r_abs = opj(ds.path, r)
                        if r_abs in to_reporemove:
                            ap = to_reporemove[r_abs]
                        else:
                            ap = {'path': r_abs,
                                  'parentds': ds.path,
                                  'refds': refds_path
                                  }
                        ap['unavailable_path_status'] = ''
                        to_save.append(ap)
                        yield get_status_dict(
                            status='ok',
                            path=r,
                            **res_kwargs)

        if not to_save:
            # nothing left to do, potentially all errored before
            return
        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        for res in Save.__call__(
                path=[ap["path"] for ap in to_save],
                # we might have removed the reference dataset by now, recheck
                dataset=refds_path
                        if (refds_path and GitRepo.is_valid_repo(refds_path))
                        else None,
                message=message if message else '[DATALAD] removed content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
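
A matching usage sketch for the `remove` implementation above (hedged: assumes exposure as `datalad.api.remove`; `/tmp/super` and the subdataset name are hypothetical). Note how `check=True` routes file content through the drop check seen in `_drop_files` before the repository removal:

import datalad.api as dl

# remove a registered subdataset: uninstall it, deinit the submodule,
# and let the final Save.__call__ commit the .gitmodules update
for res in dl.remove(
        path='sub',                   # hypothetical subdataset name
        dataset='/tmp/super',
        recursive=True,
        check=True,
        message='remove obsolete subdataset',
        return_type='generator',
        on_failure='ignore'):
    print(res.get('action'), res['status'], res.get('path'))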
Example #11
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None):

        if path is None and dataset is None:
            raise InsufficientArgumentsError(
                "insufficient arguments for unlocking: needs at least "
                "a dataset or a path to unlock.")

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='unlock', logger=lgr, refds=refds_path)

        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='unlock',
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist",
                nondataset_path_status='impossible',
                modified=None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', 'dataset') == 'dataset':
                # this is a dataset
                ap['process_content'] = True
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert(not completed)

        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            content = content_by_ds[ds_path]

            # no annex, no unlock:
            if not isinstance(ds.repo, AnnexRepo):
                for ap in content:
                    ap['status'] = 'notneeded'
                    ap['message'] = "not annex'ed, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap
                continue

            # direct mode, no unlock:
            elif ds.repo.is_direct_mode():
                for ap in content:
                    ap['status'] = 'notneeded'
                    ap['message'] = "direct mode, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap
                continue

            # only files in annex with their content present:
            files = [ap['path'] for ap in content]
            to_unlock = []
            for ap, under_annex, has_content in \
                zip(content,
                    ds.repo.is_under_annex(files),
                    ds.repo.file_has_content(files)):

                # TODO: what about directories? Make sure there is no
                # situation (e.g. no file with content beneath, or everything
                # in git) that leads to a CommandError
                # For now pass to annex:
                from os.path import isdir
                if isdir(ap['path']):
                    to_unlock.append(ap)
                    continue

                # Note that `file_has_content` is (planned to report) True on
                # files in git. Therefore order matters: first check for annex!
                if under_annex:
                    if has_content:
                        to_unlock.append(ap)
                    # no content, no unlock:
                    else:
                        ap['status'] = 'impossible'
                        ap['message'] = "no content present, can't unlock"
                        ap.update(res_kwargs)
                        yield ap
                # file in git, no unlock:
                else:
                    ap['status'] = 'notneeded'
                    ap['message'] = "not controlled by annex, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap

            # don't call annex-unlock without a path; this is the case when
            # nothing survived the filtering above
            if content and not to_unlock:
                continue

            for r in ds.repo.unlock([ap['path'] for ap in to_unlock]):
                yield get_status_dict(
                    path=opj(ds.path, r),
                    status='ok',
                    type='file',
                    **res_kwargs)
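
The filtering above only ever hands annexed files with local content to `git annex unlock`. A usage sketch (hedged: assumes exposure as `datalad.api.unlock`; the annexed file path is hypothetical):

import datalad.api as dl

# unlock an annexed file for in-place editing; files tracked by git or
# without local content come back as 'notneeded'/'impossible' instead
for res in dl.unlock(
        path=['data/big.nii.gz'],     # hypothetical annexed file
        dataset='/tmp/myds',
        return_type='generator',
        on_failure='ignore'):
    print(res['status'], res['path'])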
Example #12
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None):

        if path is None and dataset is None:
            raise InsufficientArgumentsError(
                "insufficient arguments for unlocking: needs at least "
                "a dataset or a path to unlock.")

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='unlock', logger=lgr, refds=refds_path)

        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='unlock',
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist",
                nondataset_path_status='impossible',
                modified=None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', 'dataset') == 'dataset':
                # this is a dataset
                ap['process_content'] = True
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert(not completed)

        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            content = content_by_ds[ds_path]

            # no annex, no unlock:
            if not isinstance(ds.repo, AnnexRepo):
                for ap in content:
                    ap['status'] = 'notneeded'
                    ap['message'] = "not annex'ed, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap
                continue

            # direct mode, no unlock:
            elif ds.repo.is_direct_mode():
                for ap in content:
                    ap['status'] = 'notneeded'
                    ap['message'] = "direct mode, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap
                continue

            # only files in annex with their content present:
            files = [ap['path'] for ap in content]
            to_unlock = []
            for ap, under_annex, has_content in \
                zip(content,
                    ds.repo.is_under_annex(files),
                    ds.repo.file_has_content(files)):

                # TODO: what about directories? Make sure there is no
                # situation (e.g. no file with content beneath, or everything
                # in git) that leads to a CommandError
                # For now pass to annex:
                from os.path import isdir
                if isdir(ap['path']):
                    to_unlock.append(ap)
                    continue

                # Note that `file_has_content` is (planned to report) True on
                # files in git. Therefore order matters: first check for annex!
                if under_annex:
                    if has_content:
                        to_unlock.append(ap)
                    # no content, no unlock:
                    else:
                        ap['status'] = 'impossible'
                        ap['message'] = "no content present, can't unlock"
                        ap.update(res_kwargs)
                        yield ap
                # file in git, no unlock:
                else:
                    ap['status'] = 'notneeded'
                    ap['message'] = "not controlled by annex, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap

            # don't call annex-unlock without a path; this is the case when
            # nothing survived the filtering above
            if content and not to_unlock:
                continue

            for r in ds.repo.unlock([ap['path'] for ap in to_unlock]):
                yield get_status_dict(
                    path=r, status='ok', type='file', **res_kwargs)
Example #13
    def __call__(path=None,
                 dataset=None,
                 add=None,
                 init=None,
                 remove=None,
                 reset=None,
                 define_key=None,
                 dataset_global=False,
                 recursive=False,
                 recursion_limit=None):
        # bring metadataset setter args in shape first
        untag, remove = _parse_argspec(remove)
        purge, reset = _parse_argspec(reset)
        tag_add, add = _parse_argspec(add)
        tag_init, init = _parse_argspec(init)
        define_key = dict(define_key) if define_key else None
        # merge all potential sources of tag specifications
        all_untag = remove.get('tag', []) + untag
        if all_untag:
            remove['tag'] = all_untag
        all_addtag = add.get('tag', []) + tag_add
        if all_addtag:
            add['tag'] = all_addtag
        all_inittag = init.get('tag', []) + tag_init
        if all_inittag:
            init['tag'] = all_inittag

        lgr.debug("Will 'init' metadata items: %s", init)
        lgr.debug("Will 'add' metadata items: %s", add)
        lgr.debug("Will 'remove' metadata items: %s", remove)
        lgr.debug("Will 'reset' metadata items: %s", reset)
        lgr.debug("Will 'purge' metadata items: %s", purge)

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='metadata', logger=lgr, refds=refds_path)

        to_process = []
        for ap in AnnotatePaths.__call__(dataset=refds_path,
                                         path=path,
                                         recursive=recursive,
                                         recursion_limit=recursion_limit,
                                         action='metadata',
                                         unavailable_path_status='error',
                                         nondataset_path_status='error',
                                         force_subds_discovery=False,
                                         return_type='generator',
                                         on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset':
                if ap.get('state', None) == 'absent':
                    # just discovered via recursion, but not relevant here
                    continue
                if GitRepo.is_valid_repo(ap['path']):
                    ap['process_content'] = True
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path,
                path_only=False)
        assert (not completed)

        # iterate over all datasets, order doesn't matter
        to_save = []
        for ds_path in content_by_ds:
            # ignore submodule entries
            content = [
                ap for ap in content_by_ds[ds_path]
                if ap.get('type', None) != 'dataset' or ap['path'] == ds_path
            ]
            if not content:
                # nothing other than subdatasets were given or discovered in
                # this dataset, ignore
                continue
            ds = Dataset(ds_path)
            if dataset_global or define_key:
                db_path = opj(ds.path, '.datalad', 'metadata', 'dataset.json')
                db = {}
                if exists(db_path):
                    db_fp = open(db_path)
                    # need to read manually, load() would puke on an empty file
                    db_content = db_fp.read()
                    # minimize time for collision
                    db_fp.close()
                    if db_content:
                        db = json.loads(db_content)
                # TODO make manipulation order identical to what git-annex does
                for k, v in init.items() if init else []:
                    if k not in db:
                        db[k] = v
                for k in purge:
                    if k in db:
                        del db[k]
                for k, v in reset.items():
                    db[k] = v
                for k, v in add.items():
                    db[k] = sorted(unique(db.get(k, []) + v))
                for k, v in remove.items():
                    existing_data = db.get(k, [])
                    if isinstance(existing_data, dict):
                        db[k] = {
                            dk: existing_data[dk]
                            for dk in set(existing_data).difference(v)
                        }
                    else:
                        db[k] = list(set(existing_data).difference(v))
                    # wipe out if empty
                    if not db[k]:
                        del db[k]

                added_def = False
                if define_key:
                    defs = db.get('definition', {})
                    for k, v in define_key.items():
                        if k in defs:
                            if not defs[k] == v:
                                yield get_status_dict(
                                    status='error',
                                    ds=ds,
                                    message=
                                    ("conflicting definition for key '%s': '%s' != '%s'",
                                     k, v, defs[k]),
                                    **res_kwargs)
                                continue
                        else:
                            defs[k] = v
                            added_def = True
                    db['definition'] = defs
                # store, if there is anything
                if db:
                    if not exists(dirname(db_path)):
                        makedirs(dirname(db_path))
                    db_fp = open(db_path, 'w')
                    # produce relatively compact, but also diff-friendly format
                    json.dump(db,
                              db_fp,
                              indent=0,
                              separators=(',', ':\n'),
                              sort_keys=True)
                    # minimize time for collision
                    db_fp.close()
                    # use add not save to also cover case of a fresh file
                    ds.add(db_path, save=False)
                    to_save.append(
                        dict(path=db_path, parentds=ds.path, type='file'))
                elif exists(db_path):
                    # no metadata left, kill file
                    ds.remove(db_path)
                    to_save.append(dict(path=ds.path, type='dataset'))
                if added_def or init or add or remove or reset or purge:
                    # if anything happened or could have happened
                    yield get_status_dict(status='ok',
                                          ds=ds,
                                          metadata=db,
                                          **res_kwargs)
            elif not isinstance(ds.repo, AnnexRepo):
                # report on all explicitly requested paths only
                for ap in [c for c in content if c.get('raw_input', False)]:
                    yield dict(
                        ap,
                        status='impossible',
                        message=(
                            'non-annex dataset %s has no file metadata support',
                            ds),
                        **res_kwargs)
                continue
            ds_paths = [p['path'] for p in content]
            if not dataset_global:
                if reset or purge or add or init or remove:
                    # file metadata manipulation
                    mod_paths = []
                    for mp in ds.repo.set_metadata(
                            ds_paths,
                            reset=reset,
                            add=add,
                            init=init,
                            remove=remove,
                            purge=purge,
                            # we always go recursive
                            # TODO is that a good thing? But how to otherwise distinguish
                            # this kind of recursive from the one across datasets in
                            # the API?
                            recursive=True):
                        if mp.get('success', False):
                            mod_paths.append(mp['file'])
                        else:
                            yield get_status_dict(
                                status='error',
                                message='setting metadata failed',
                                path=opj(ds.path, mp[0]),
                                type='file',
                                **res_kwargs)
                    # query the actually modified paths only
                    ds_paths = mod_paths

                # and lastly, query -- even if we set before -- there could
                # be side-effect from multiple set paths on an individual
                # path, hence we need to query to get the final result
                for file, meta in ds.repo.get_metadata(ds_paths):
                    r = get_status_dict(status='ok',
                                        path=opj(ds.path, file),
                                        type='file',
                                        metadata=meta,
                                        **res_kwargs)
                    yield r
        # save potential modifications to dataset global metadata
        if not to_save:
            return
        for res in Save.__call__(path=to_save,
                                 dataset=refds_path,
                                 message='[DATALAD] dataset metadata update',
                                 return_type='generator',
                                 result_xfm=None,
                                 result_filter=None,
                                 on_failure='ignore'):
            yield res
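
A usage sketch for the `metadata` implementation above (hedged: assumes exposure as `datalad.api.metadata`; that bare strings in the argspec are parsed as plain tags by `_parse_argspec` is an assumption, as is the file path):

import datalad.api as dl

# attach a tag to a file's git-annex metadata and print the final,
# re-queried metadata record for it
for res in dl.metadata(
        path='data/big.nii.gz',       # hypothetical annexed file
        dataset='/tmp/myds',
        add=['raw-data'],             # assumed to be parsed as a plain tag
        return_type='generator',
        on_failure='ignore'):
    print(res.get('path'), res.get('metadata'))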
Example #14
    def __call__(
            path=None,
            source=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            get_data=True,
            description=None,
            reckless=False,
            #git_opts=None,
            #annex_opts=None,
            #annex_get_opts=None,
            jobs='auto',
            verbose=False,
    ):
        # IMPLEMENTATION CONCEPT:
        #
        # 1. Sort the world into existing handles and the rest
        # 2. Try to locate missing handles (obtain subdatasets along the way)
        # 3. Expand into subdatasets with recursion enabled (potentially
        #    obtaining even more subdatasets)
        # 4. Shoot info of which handles to get in each subdataset to
        #    git-annex, once at the very end

        refds_path = Interface.get_refds_path(dataset)
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        # remember which results we already reported, to avoid duplicates
        yielded_ds = []
        to_get = []
        unavailable_paths = []
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='get',
                # NOTE: Do not act upon unavailable paths yet! Done below after
                # testing which ones could be obtained
                unavailable_path_status='',
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('state', None) == 'absent' and ap.get('raw_input', False):
                # if this wasn't found, but directly requested, queue for further
                # exploration
                unavailable_paths.append(ap)
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                # do not report what hasn't arrived yet
                # also do not report the base dataset that is already
                # present -- no surprise
                yield dict(ap, status='notneeded', logger=lgr,
                           message='already installed')
                yielded_ds.append(ap['path'])
                ap['process_content'] = get_data
            to_get.append(ap)

        # explore the unknown
        for ap in sorted(unavailable_paths, key=lambda x: x['path']):
            lgr.debug("Investigate yet unavailable path %s", ap)
            # how close can we get?
            dspath = ap.get('parentds', get_dataset_root(ap['path']))
            if dspath is None:
                # nothing we can do for this path
                continue
            lgr.debug("Found containing dataset %s for path %s", dspath, ap['path'])
            ds = Dataset(dspath)
            # now actually obtain whatever is necessary to get to this path
            containing_ds = [dspath]
            for res in _install_necessary_subdatasets(
                    ds, ap['path'], reckless, refds_path, description=description):
                # yield immediately so errors could be acted upon outside, before
                # we continue
                if not (res['type'] == 'dataset' and res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record, recursive below might now want to report
                        # a 'notneeded'
                        yielded_ds.append(res['path'])
                    yield res
                # update to the current innermost dataset
                containing_ds.append(res['path'])

            if len(containing_ds) < 2:
                # no subdataset was installed, hence if the path was unavailable
                # before it still is, no need to bother git annex
                ap.update(status='impossible',
                          message='path does not exist')
                yield ap
                continue
            # important to only do the next for the innermost subdataset
            # as the `recursive` logic below relies on that!
            # set the correct parent, for a dataset this would be the second-last
            # reported subdataset
            ap.update(parentds=containing_ds[-1])
            if containing_ds[-1] == ap['path']:
                # the path actually refers to the last installed dataset
                ap.update(parentds=containing_ds[-2],
                          process_content=get_data,
                          type='dataset')
            to_get.append(ap)

        # results of recursive installation of yet undiscovered datasets
        rec_get = []
        if recursive and not recursion_limit == 'existing':
            # obtain any subdatasets underneath the paths given inside the
            # subdatasets that we know already exist
            # unless we do not want recursion into not-yet-installed datasets
            for ap in sorted(to_get, key=lambda x: x['path']):
                if ap['type'] not in ('dataset', 'directory') or not ap.get('raw_input', False):
                    # a non-directory cannot have content underneath
                    # also we do NOT want to recurse into anything that was not
                    # explicitly requested, to avoid duplication
                    continue
                subds = Dataset(ap['path'] if ap['type'] == 'dataset' else ap['parentds'])
                lgr.info(
                    "Installing %s%s recursively",
                    subds,
                    (" underneath %s" % ap['path']
                     if subds.path != ap['path']
                     else ""))
                for res in _recursive_install_subds_underneath(
                        subds,
                        # `ap['path']` was explicitly given as input
                        # we count recursions from the input, hence we
                        # can start with the full number
                        recursion_limit,
                        reckless,
                        start=ap['path'],
                        refds_path=refds_path,
                        description=description):
                    # yield immediately so errors could be acted upon
                    # outside, before we continue
                    if not (res['type'] == 'dataset' and res['path'] in yielded_ds):
                        # unless we reported on this dataset before
                        if res['type'] == 'dataset':
                            # make a record
                            yielded_ds.append(res['path'])
                    yield res
                    if not (res['status'] == 'ok' and res['type'] == 'dataset'):
                        # not a dataset that was just installed, we just reported it
                        # upstairs, and can ignore it from now on
                        continue
                    # paranoia, so popular these days...
                    assert GitRepo.is_valid_repo(res['path'])
                    # keep a copy of the install record for `get` later on
                    get_ap = {k: v for k, v in res.items()
                              if not k == 'status'}
                    get_ap['process_content'] = get_data
                    rec_get.append(get_ap)

        if not get_data:
            # done already
            return

        # merge the two AP lists
        to_get.extend(rec_get)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_get,
                refds_path=refds_path)
        assert(not completed)

        # hand over to git-annex, get files content,
        # report files in git as 'notneeded' to get
        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            # grab content, ignore subdataset entries
            content = [ap['path'] for ap in content_by_ds[ds_path]
                       if ap.get('type', None) != 'dataset' or ap['path'] == ds.path]
            if not content:
                # cut this short should there be nothing
                continue
            # needs to be an annex to get content
            if not isinstance(ds.repo, AnnexRepo):
                for r in results_from_paths(
                        content, status='notneeded',
                        message="no dataset annex, content already present",
                        action='get', logger=lgr,
                        refds=refds_path):
                    yield r
                continue
            respath_by_status = {}
            for res in ds.repo.get(
                    content,
                    options=['--from=%s' % source] if source else [],
                    jobs=jobs):
                res = annexjson2result(res, ds, type='file', logger=lgr,
                                       refds=refds_path)
                success = success_status_map[res['status']]
                # TODO: in case of some failed commands (e.g. get) there might
                # be no path in the record.  yoh has only vague idea of logic
                # here so just checks for having 'path', but according to
                # results_from_annex_noinfo, then it would be assumed that
                # `content` was acquired successfully, which is not the case
                if 'path' in res:
                    respath_by_status[success] = \
                        respath_by_status.get(success, []) + [res['path']]
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    content,
                    respath_by_status,
                    dir_fail_msg='could not get some content in %s %s',
                    noinfo_dir_msg='nothing to get from %s',
                    noinfo_file_msg='already present',
                    action='get',
                    logger=lgr,
                    refds=refds_path):
                yield r
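
A usage sketch for the `get` implementation above (hedged: assumes exposure as `datalad.api.get`; the path inside a not-yet-installed subdataset is hypothetical). Intermediate subdatasets are installed on demand via `_install_necessary_subdatasets` before git-annex fetches the content:

import datalad.api as dl

# fetch file content, installing any subdatasets on the way;
# get_data=False would obtain only the dataset tree itself
for res in dl.get(
        path='sub/ds/data.bin',       # hypothetical path in a subdataset
        dataset='/tmp/super',
        get_data=True,
        jobs='auto',
        return_type='generator',
        on_failure='ignore'):
    print(res['status'], res.get('path'))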
Example #15
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            check=True,
            save=True,
            message=None,
            if_dirty='save-before'):
        res_kwargs = dict(action='remove', logger=lgr)
        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `remove`: requires at least a path or dataset")
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs['refds'] = refds_path
        if refds_path and not path and not GitRepo.is_valid_repo(refds_path):
            # nothing here, nothing to remove
            yield get_status_dict(path=refds_path, status='notneeded', **res_kwargs)
            return
        if refds_path and not path:
            # act on the whole dataset if nothing else was specified
            # TODO i think that would happen automatically in annotation?
            path = refds_path

        to_process = []

        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                # we only ever want to discover immediate subdatasets, the rest
                # will happen in `uninstall`
                recursion_limit=1,
                action='remove',
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('state', None) == 'absent' and \
                    ap.get('parentds', None) is None:
                # nothing exists at location, and there is no parent to
                # remove from
                ap['status'] = 'notneeded'
                ap['message'] = "path does not exist and is not in a dataset"
                yield ap
                continue
            if ap.get('raw_input', False) and ap.get('type', None) == 'dataset':
                # make sure dataset sorting yields a dedicated entry for this one
                ap['process_content'] = True
            to_process.append(ap)

        if not to_process:
            # nothing left to do, potentially all errored before
            return

        if path_is_under([ap['path'] for ap in to_process]):
            # behave like `rm` and refuse to remove where we are
            raise ValueError(
                "refusing to uninstall current or parent directory")

        # now sort into datasets so we can process them one by one
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert(not completed)

        # iterate over all datasets, starting at the bottom
        # to make the removal of dataset content known upstairs
        to_save = []
        # track which submodules we have removed in the process, to avoid
        # failure in case we revisit them due to a subsequent path argument
        subm_removed = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            paths = content_by_ds[ds_path]
            to_reporemove = dict()
            # PLAN any dataset that was not raw_input, uninstall (passing recursive flag)
            # if dataset itself is in paths, skip any nondataset
            # sort reverse so we get subdatasets first
            for ap in sorted(paths, key=lambda x: x['path'], reverse=True):
                if ap.get('type', None) == 'dataset':
                    # entire dataset needs to go, uninstall if present, pass recursive!
                    uninstall_failed = False
                    if ap['path'] == refds_path or \
                            (refds_path is None and ap.get('raw_input', False)):
                        # top-level handling, cannot use regular uninstall call, as
                        # it will refuse to uninstall a top-level dataset
                        # and rightfully so, it is really a remove in that case
                        # bypass all the safety by using low-level helper
                        for r in _uninstall_dataset(ds, check=check, has_super=False,
                                                    **res_kwargs):
                            if r['status'] in ('impossible', 'error'):
                                # we need to inspect if something went wrong, in order
                                # to prevent failure from removing a non-empty dir below,
                                # but at the same time allow for continued processing
                                uninstall_failed = True
                            r['refds'] = refds_path
                            yield r
                    # recheck that it wasn't removed during a previous iteration
                    elif ap.get('state', None) != 'absent' and GitRepo.is_valid_repo(ap['path']):
                        # anything that is not the top-level -> regular uninstall
                        # this is for subdatasets of the to-be-removed dataset
                        # we want to simply uninstall them in a regular manner
                        for r in Uninstall.__call__(
                                # use annotate path as input, but pass a copy because
                                # we cannot rely on it being unaltered by reannotation
                                # TODO maybe adjust annotate_path to do that
                                [ap.copy()],
                                dataset=refds_path, recursive=recursive, check=check,
                                if_dirty=if_dirty, result_xfm=None, result_filter=None,
                                on_failure='ignore'):
                            if r['status'] in ('impossible', 'error'):
                                # we need to inspect if something went wrong, in order
                                # to prevent failure from removing a non-empty dir below,
                                # but at the same time allow for continued processing
                                uninstall_failed = True
                            yield r
                    if not ap.get('raw_input', False):
                        # we only ever want to actually unregister subdatasets that
                        # were given explicitly
                        continue
                    if not uninstall_failed and \
                            not ap['path'] in subm_removed and \
                            refds_path and \
                            ap.get('parentds', None) and \
                            not (relpath(ap['path'], start=refds_path).startswith(pardir) or
                                 ap['path'] == refds_path) and \
                            ap.get('registered_subds', False):
                        # strip from superdataset, but only if a dataset was given explicitly
                        # as in "remove from this dataset", but not when just a path was given
                        # as in "remove from the filesystem"
                        subds_relpath = relpath(ap['path'], start=ap['parentds'])
                        # remove submodule reference
                        parentds = Dataset(ap['parentds'])
                        # play safe, will fail on dirty
                        parentds.repo.deinit_submodule(ap['path'])
                        # remove now empty submodule link
                        parentds.repo.remove(ap['path'])
                        # make a record that we removed this already, should it be
                        # revisited via another path argument, because we do not reannotate
                        # the paths after every removal
                        subm_removed.append(ap['path'])
                        yield dict(ap, status='ok', **res_kwargs)
                        # need .gitmodules update in parent
                        to_save.append(dict(
                            path=opj(parentds.path, '.gitmodules'),
                            parents=parentds.path,
                            type='file'))
                        # and the removal itself needs to be committed
                        # inform `save` that it is OK that this path
                        # doesn't exist on the filesystem anymore
                        ap['unavailable_path_status'] = ''
                        ap['process_content'] = False
                        to_save.append(ap)
                    if not uninstall_failed and exists(ap['path']):
                        # could be an empty dir in case an already uninstalled subdataset
                        # got removed
                        rmdir(ap['path'])
                else:
                    # anything that is not a dataset can simply be passed on
                    to_reporemove[ap['path']] = ap
            # avoid unnecessary git calls when there is nothing to do
            if to_reporemove:
                if check and hasattr(ds.repo, 'drop'):
                    for r in _drop_files(ds, list(to_reporemove),
                                         check=True):
                        if r['status'] == 'error':
                            # if drop errored on that path, we can't remove it
                            to_reporemove.pop(r['path'], None)
                        yield r

                if to_reporemove:
                    for r in ds.repo.remove(list(to_reporemove), r=True):
                        # these were removed, but we still need to save the
                        # removal

                        r_abs = opj(ds.path, r)
                        if r_abs in to_reporemove:
                            ap = to_reporemove[r_abs]
                        else:
                            ap = {'path': r_abs,
                                  'parentds': ds.path,
                                  'refds': refds_path
                                  }
                        ap['unavailable_path_status'] = ''
                        to_save.append(ap)
                        yield get_status_dict(
                            status='ok',
                            path=r,
                            **res_kwargs)

        if not to_save:
            # nothing left to do, potentially all errored before
            return
        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        for res in Save.__call__(
                # TODO compose hand-selected annotated paths
                path=to_save,
                # we might have removed the reference dataset by now, recheck
                dataset=refds_path
                        if (refds_path and GitRepo.is_valid_repo(refds_path))
                        else None,
                message=message if message else '[DATALAD] removed content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
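A note on the pattern above: every stage yields plain status dicts, and the final Save.__call__ is itself consumed as a generator whose results are re-yielded to the caller. Below is a minimal, self-contained sketch of that pipeline shape; all names in it (_fake_save, remove_then_save) are hypothetical stand-ins, not DataLad API.

def _fake_save(paths, message):
    # stand-in for Save.__call__: yield one status dict per saved path
    for p in paths:
        yield dict(action='save', path=p, status='ok', message=message)

def remove_then_save(paths, save=True, message=None):
    to_save = []
    for p in paths:
        # pretend the removal itself succeeded and queue it for saving
        yield dict(action='remove', path=p, status='ok')
        to_save.append(p)
    if not to_save or not save:
        # nothing left to do, or saving was disabled by the caller
        return
    for res in _fake_save(to_save, message or '[DATALAD] removed content'):
        yield res

for r in remove_then_save(['a.txt', 'b.txt']):
    print(r['action'], r['path'], r['status'])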
Beispiel #16
0
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None):

        if path is None and dataset is None:
            raise InsufficientArgumentsError(
                "insufficient arguments for unlocking: needs at least "
                "a dataset or a path to unlock.")

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='unlock', logger=lgr, refds=refds_path)

        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='unlock',
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist",
                nondataset_path_status='impossible',
                modified=None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', 'dataset') == 'dataset':
                # this is a dataset
                ap['process_content'] = True
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path,
                path_only=False)
        assert (not completed)

        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            content = content_by_ds[ds_path]

            if not isinstance(ds.repo, AnnexRepo):
                for ap in content:
                    ap['status'] = 'notneeded'
                    ap['message'] = "not annex'ed, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap
                continue

            files = [ap['path'] for ap in content]

            for r in ds.repo.unlock(files):
                yield get_status_dict(path=r,
                                      status='ok',
                                      type='file',
                                      **res_kwargs)
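Like most examples here, unlock first groups annotated paths by their containing dataset via annotated2content_by_ds and only then talks to each repository once. Below is a simplified, hypothetical re-implementation of just that grouping idea; the real helper also tracks dataset properties and completed results.

def group_by_dataset(annotated_paths):
    # a dataset path is its own container; anything else files under 'parentds'
    content_by_ds = {}
    for ap in annotated_paths:
        ds = ap['path'] if ap.get('type') == 'dataset' else ap['parentds']
        content_by_ds.setdefault(ds, []).append(ap)
    return content_by_ds

aps = [
    {'path': '/ds', 'type': 'dataset'},
    {'path': '/ds/file1', 'type': 'file', 'parentds': '/ds'},
    {'path': '/ds/sub/file2', 'type': 'file', 'parentds': '/ds/sub'},
]
for ds_path in sorted(group_by_dataset(aps)):
    print(ds_path)  # /ds, then /ds/sub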
Beispiel #17
0
    def __call__(path=None,
                 dataset=None,
                 to=None,
                 since=None,
                 missing='fail',
                 force=False,
                 transfer_data='auto',
                 recursive=False,
                 recursion_limit=None,
                 git_opts=None,
                 annex_opts=None,
                 annex_copy_opts=None,
                 jobs=None):

        # if ever we get a mode, for "with-data" we would need this
        #if dataset and not path:
        #    # act on the whole dataset if nothing else was specified
        #    path = dataset.path if isinstance(dataset, Dataset) else dataset

        if not dataset and not path:
            # try to find a dataset in PWD
            dataset = require_dataset(None,
                                      check_installed=True,
                                      purpose='publishing')

        if since and not dataset:
            raise InsufficientArgumentsError(
                'Modification detection (--since) without a base dataset '
                'is not supported')

        if dataset and since == '':
            # empty 'since' means: only publish changes since the last update,
            # so figure out what that was
            active_branch = dataset.repo.get_active_branch()
            if to:
                # XXX here we assume one to one mapping of names from local branches
                # to the remote
                since = '%s/%s' % (to, active_branch)
            else:
                # take tracking remote for the active branch
                tracked_remote, tracked_refspec = dataset.repo.get_tracking_branch()
                if tracked_remote:
                    if tracked_refspec.startswith('refs/heads/'):
                        tracked_refspec = tracked_refspec[len('refs/heads/'):]
                    #to = tracked_remote
                    since = '%s/%s' % (tracked_remote, tracked_refspec)
                else:
                    lgr.info(
                        "No tracked remote for %s, the --since option has no effect",
                        active_branch)
                    since = None

        # here is the plan
        # 1. figure out remote to publish to
        # 2. figure out which content needs to be published to this remote
        # 3. look for any pre-publication dependencies of that remote
        #    (i.e. remotes that need to be published to before)
        # 4. publish the content needed to go to the primary remote to
        #    the dependencies first, and to the primary afterwards
        ds_remote_info = {}

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(refds=refds_path, logger=lgr, action='publish')

        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='publish',
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                modified=since,
                return_type='generator',
                on_failure='ignore',
                force_no_revision_change_discovery=False,  # we cannot publish what was not committed
                force_untracked_discovery=False  # we cannot publish untracked
        ):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            remote_info_result = None
            if ap.get('type', ap.get('type_src', 'dataset')) != 'dataset':
                # for everything that is not a dataset get the remote info
                # for the parent
                parentds = ap.get('parentds', None)
                if parentds and parentds not in ds_remote_info:
                    remote_info_result = _get_remote_info(
                        parentds, ds_remote_info, to, missing)
            else:
                # this is a dataset
                if ap.get('state', None) == 'absent':
                    continue
                # get the remote info for itself
                remote_info_result = _get_remote_info(ap['path'],
                                                      ds_remote_info, to,
                                                      missing)
                ap['process_content'] = True
            if remote_info_result is not None:
                ap['status'] = remote_info_result[0]
                ap['message'] = remote_info_result[1]
                yield ap
                continue
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert (not completed)

        lgr.debug("Evaluating %i dataset publication candidate(s)",
                  len(content_by_ds))
        # TODO: fancier sorting, so we still follow somewhat the hierarchy
        #       in sorted order, e.g.
        #  d1/sub1/sub1
        #  d1/sub1
        #  d1
        #  d2/sub1
        #  d2
        content_by_ds = OrderedDict(
            (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True))

        lgr.debug("Attempt to publish %i datasets", len(content_by_ds))
        for ds_path in content_by_ds:
            remote_info = ds_remote_info.get(ds_path, None)
            if remote_info is None:
                # maybe this dataset wasn't annotated above, try to get info
                # MIH: I think this entire if-branch is practically impossible
                # to reach. It is certainly untested, but I think this is due
                # to mutually exclusive conditions during remote_info detection
                remote_info_result = _get_remote_info(ds_path, ds_remote_info,
                                                      to, missing)
                if remote_info_result is not None:
                    yield get_status_dict(type='dataset',
                                          path=ds_path,
                                          status=remote_info_result[0],
                                          message=remote_info_result[1],
                                          **res_kwargs)
                    continue
                # continue with freshly obtained info
                remote_info = ds_remote_info[ds_path]
                # condition above must catch all other cases
                assert remote_info
            # and publish
            ds = Dataset(ds_path)
            for r in _publish_dataset(
                    ds,
                    remote=remote_info['remote'],
                    refspec=remote_info.get('refspec', None),
                    # only send paths that were explicitly requested
                    paths=[p for p in content_by_ds[ds_path]
                           # do not feed (sub)dataset paths into the beast
                           # makes no sense to try to annex copy them
                           # for the base dataset itself let `transfer_data`
                           # decide
                           if p.get('type', None) != 'dataset'],
                    annex_copy_options=annex_copy_opts,
                    force=force,
                    jobs=jobs,
                    transfer_data=transfer_data,
                    **res_kwargs):
                yield r
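The ordering trick above is worth isolating: sorting dataset paths in reverse lexicographic order guarantees that every subdataset path sorts before its superdataset, so publication proceeds bottom-up. A tiny demonstration with made-up paths:

from collections import OrderedDict

content_by_ds = {'/d1': [], '/d1/sub1': [], '/d1/sub1/sub1': [], '/d2': []}
ordered = OrderedDict(
    (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True))
print(list(ordered))
# ['/d2', '/d1/sub1/sub1', '/d1/sub1', '/d1']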
Beispiel #19
0
    def __call__(
            path=None,
            dataset=None,
            # support passing this through in a path by path basis
            to_git=None,
            save=True,
            message=None,
            message_file=None,
            recursive=False,
            recursion_limit=None,
            ds2super=False,
            git_opts=None,
            annex_opts=None,
            annex_add_opts=None,
            jobs=None):
        # parameter constraints:
        if not path:
            raise InsufficientArgumentsError(
                "insufficient information for adding: requires at least a path")
        refds_path = Interface.get_refds_path(dataset)
        common_report = dict(action='add', logger=lgr, refds=refds_path)

        if message and message_file:
            raise ValueError("Both a message and message file were specified")

        if message_file:
            with open(message_file, "rb") as mfh:
                message = assure_unicode(mfh.read())

        to_add = []
        subds_to_add = {}
        ds_to_annotate_from_recursion = {}
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=dataset,
                # never recursion, need to handle manually below to be able to
                # discover untracked content
                recursive=False,
                action='add',
                # speed things up by using Git's modification detection, if there
                # is a repo with at least one commit
                modified='HEAD' \
                if dataset and \
                GitRepo.is_valid_repo(refds_path) and \
                GitRepo(refds_path).get_hexsha() \
                else None,
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            got_nothing = False
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset':
                yield get_status_dict(
                    status='impossible',
                    message='"there is no dataset to add this path to',
                    **dict(common_report, **ap))
                continue
            if ap.get('type', None) == 'directory' and \
                    ap.get('state', None) == 'untracked' and \
                    GitRepo.is_valid_repo(ap['path']):
                # this is an untracked wannabe subdataset in disguise
                ap['type'] = 'dataset'
            if recursive and \
                    (ap.get('raw_input', False) or
                     ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                    (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
                # this was an actually requested input path, or a path that was found
                # modified by path annotation, based on an input argument
                # we need to recurse into all subdirs to find potentially
                # unregistered subdatasets
                # but only if this path has a parent, or is itself a dataset
                # otherwise there is nothing to add to
                _discover_subdatasets_recursively(
                    ds_to_annotate_from_recursion,
                    ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']],
                    recursion_limit)
                # get the file content of the root dataset of this search added too
                # but be careful with extreme recursion_limit settings
                if recursion_limit is None or recursion_limit > 0:
                    ap['process_content'] = True
            # record for further processing,
            # unless it was somehow already discovered
            if ap['path'] not in ds_to_annotate_from_recursion:
                to_add.append(ap)
        if got_nothing:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict(
                'add',
                status='notneeded',
                path=refds_path,
                type='dataset',
                logger=lgr)
            return

        for subds in ds_to_annotate_from_recursion:
            if subds not in subds_to_add:
                # always prefer the already annotated path
                subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

        if dataset:
            # we have a base dataset, discover any intermediate datasets between
            # the base and any already discovered dataset
            discovered = {}
            discover_dataset_trace_to_targets(
                # from here
                dataset.path,
                # to any dataset we are aware of
                subds_to_add.keys(),
                [],
                discovered)
            for parentds in discovered:
                for subds in discovered[parentds]:
                    subds_to_add[subds] = subds_to_add.get(
                        subds,
                        dict(path=subds, parentds=parentds, type='dataset'))

        # merge custom paths and discovered dataset records, paths need to go
        # first, because we know most about them, and in the subsequent
        # annotation call we skip the later duplicates
        to_add.extend(subds_to_add.values())
        # and compact, this should be OK as all the info is in each ap dict
        to_add = unique(to_add, lambda x: x['path'])

        if not to_add:
            # nothing left to do, potentially all errored before
            return

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, it will also deduplicate
        annotated_paths = AnnotatePaths.__call__(
            path=to_add,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='add',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert(not completed)

        if not content_by_ds:
            # we should have complained about any inappropriate path argument
            # above, so if nothing is left, we can simply exit
            return

        # simple loop over datasets -- save happens later
        # start deep down
        to_save = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            torepoadd = {}
            respath_by_status = {}
            for ap in content_by_ds[ds_path]:
                # we have a new story
                ap.pop('status', None)
                torepoadd[ap['path']] = ap

                # skip anything that doesn't look like a wannabe subdataset
                if ap.get('type', None) != 'dataset' or \
                        ap['path'] == ds_path:
                    continue

                if ap.get('registered_subds', False):
                    # subdataset that might be in this list because of the
                    # need to save all the way up to a super dataset
                    respath_by_status['success'] = \
                        respath_by_status.get('success', []) + [ap['path']]
                    yield get_status_dict(
                        status='notneeded',
                        message="already known subdataset",
                        **dict(common_report, **ap))
                    continue
                subds = Dataset(ap['path'])
                subds_relpath = relpath(ap['path'], ds_path)
                # Register the repository in the repo tree as a submodule
                try:
                    ds.repo.add_submodule(subds_relpath, url=None, name=None)
                except (CommandError, InvalidGitRepositoryError) as e:
                    yield get_status_dict(
                        ds=subds, status='error', message=e.stderr,
                        **dict(common_report, **ap))
                    continue
                # queue for saving using the updated annotated path
                ap['registered_subds'] = True
                # I hope this is true in direct mode too
                # TODO this is disabled, because in some circumstances
                # staging just doesn't happen, and it is unclear when
                # exactly -- the case that prompted disabling was a submodule
                # that had no content except for other submodules and was not
                # staged, whereas another submodule at the same level in the
                # same superdataset, which also had one file in it, was staged
                # disabled to work correctly, at the cost of a small slowdown
                #ap['staged'] = True
                to_save.append(ap)
                # report added subdatasets -- `annex add` below won't do it
                yield get_status_dict(
                    ds=subds,
                    status='ok',
                    message='added new subdataset',
                    **dict(common_report, **ap))
                # make sure that .gitmodules is added to the list of files
                gitmodules_path = opj(ds.path, '.gitmodules')
                # for git
                torepoadd[gitmodules_path] = dict(path=gitmodules_path)
                # and for save
                to_save.append(dict(
                    path=gitmodules_path,
                    parentds=ds_path,
                    type='file'))
            # make sure any last minute additions make it to the saving stage
            # XXX should content_by_ds become an OrderedDict so that a possible
            # superdataset gets processed last?
            lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
            is_annex = isinstance(ds.repo, AnnexRepo)
            add_kw = {'jobs': jobs} if is_annex and jobs else {}
            added = ds.repo.add_(
                list(torepoadd.keys()),
                git=to_git if is_annex else True,
                **add_kw
            )
            for a in added:
                res = annexjson2result(a, ds, type='file', **common_report)
                success = success_status_map[res['status']]
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
                # produce best possible path/result annotation
                if res['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    res = dict(torepoadd[res['path']], **res)
                # override this in all cases to be safe
                res['parentds'] = ds.path
                if success:
                    # this was successfully added, queue for saving this very path
                    # in the dataset
                    # strip any status and state info (e.g. save will refuse to
                    # save stuff that is marked state='untracked')
                    to_save.append({k: v for k, v in res.items()
                                    if k not in ('status', 'state')})
                if a['file'] == '.gitmodules':
                    # filter out .gitmodules, because this is only included for
                    # technical reasons and has nothing to do with the actual content
                    continue
                if GitRepo.is_valid_repo(res['path']):
                    # more accurate report in case of an added submodule
                    # mountpoint.
                    # XXX Actually not sure if this can really happen
                    # (depends on what our low-level code would do)
                    # but worst case is that we lose a little bit of
                    # coverage...
                    res['type'] = 'dataset'
                    res['message'] = 'added new state as submodule'
                yield res

            for r in results_from_annex_noinfo(
                    ds, torepoadd, respath_by_status,
                    dir_fail_msg='could not add some content in %s %s',
                    noinfo_dir_msg='nothing to add from %s',
                    noinfo_file_msg='already included in the dataset',
                    action='add',
                    logger=lgr,
                    refds=refds_path):
                if r['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    r = dict(r, **torepoadd[r['path']])

                if r['status'] == 'notneeded':
                    # this could be a file that was staged already, it doesn't need
                    # to be added, but it should be saved/committed if so desired
                    to_save.append({k: v for k, v in r.items()
                                    if k not in ('status', 'state')})

                # XXX something is fishy with the next one, rethink when sober....
                if r['path'] == ds_path and r['status'] == 'ok':
                    # this is for the entire dataset itself which was explicitly requested
                    # make sure to save all
                    r['type'] = 'dataset'
                    r['process_content'] = True
                    to_save.append({k: v for k, v in r.items() if k != 'status'})
                yield r
            if refds_path and ds_path != refds_path and len(respath_by_status.get('success', [])):
                # TODO XXX we have an issue here when using `add('.')` and annex ignores any
                # dotfiles. In this case we end up not saving a dataset completely, because
                # we rely on accurate reporting. there is an issue about this already
                # TODO look up the issue ID
                # if there is a base dataset, but we are below it, and we have anything done to this
                # dataset -> queue dataset itself for saving its state in the parent
                ds_ap = dict(
                    path=ds.path,
                    # we have to look for the parent here, as we must save the
                    # subdataset in the parent and not the whole subdataset itself
                    type='dataset')
                parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
                if parentds:
                    ds_ap['parentds'] = parentds
                if dataset:
                    ds_ap['refds'] = refds_path
                to_save.append(ds_ap)

        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        # TODO tell save what was staged already! Set 'staged=True' for
        # respective annotated paths that are fed into `save`

        # do not reuse any of the sorting done in here for saving, but instead
        # pass on all the annotated paths to have `save` figure out what to do with
        # them -- this costs something, but should be safer, and frankly is
        # more comprehensible
        for res in Save.__call__(
                # hand-selected annotated paths
                path=to_save,
                dataset=refds_path,
                message=message if message else '[DATALAD] added content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
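The respath_by_status bookkeeping above tallies result paths into success/failure buckets so that results_from_annex_noinfo can later report on paths git-annex stayed silent about. A standalone sketch of the tallying step; the success_status_map shown is a plausible stand-in, not the exact mapping from DataLad's interface utilities.

success_status_map = {'ok': 'success', 'notneeded': 'success',
                      'impossible': 'failure', 'error': 'failure'}

results = [
    {'path': '/ds/a', 'status': 'ok'},
    {'path': '/ds/b', 'status': 'error'},
    {'path': '/ds/c', 'status': 'notneeded'},
]
respath_by_status = {}
for res in results:
    bucket = success_status_map[res['status']]
    respath_by_status.setdefault(bucket, []).append(res['path'])
print(respath_by_status)
# {'success': ['/ds/a', '/ds/c'], 'failure': ['/ds/b']}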
Beispiel #20
0
    def __call__(message=None, path=None, dataset=None,
                 all_updated=True, version_tag=None,
                 recursive=False, recursion_limit=None, super_datasets=False,
                 message_file=None
                 ):
        if not dataset and not path:
            # we got nothing at all -> save what is staged in the repo in "this" directory?
            # make sure we don't treat this as a user-provided '.' argument
            path = [{'path': abspath(curdir), 'raw_input': False}]

        refds_path = Interface.get_refds_path(dataset)

        if message and message_file:
            raise ValueError("Both a message and message file were specified")

        if message_file:
            with open(message_file, "rb") as mfh:
                message = assure_unicode(mfh.read())

        to_process = []
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='save',
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                modified='HEAD' if not path and recursive else None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('state', None) == 'untracked' and not ap.get('raw_input', False):
                # this path was found untracked, but not explicitly given to save
                # we will silently ignore this
                continue
            got_nothing = False
            # next check should not be done during annotation, as it is possibly expensive
            # and not generally useful
            if ap.get('status', None) == 'impossible' and \
                    ap.get('state', None) == 'absent' and \
                    ap.get('parentds', None):
                # this is not here anymore, but it might actually have been a deleted
                # component
                if relpath(ap['path'], start=ap['parentds']) \
                        in Dataset(ap['parentds']).repo.get_deleted_files():
                    # ok, this is a staged deletion that we want to save
                    ap['status'] = ''
                    del ap['message']
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            # for things like: `ds.save()`
            # or recursively discovered datasets
            if ap['path'] == refds_path or \
                    (ap.get('type', None) == 'dataset' and
                     not ap.get('raw_input', False) and
                     not ap.get('state', None) == 'absent'):
                ap['process_content'] = True
                ap['process_updated_only'] = all_updated
            to_process.append(ap)
        lgr.log(2, "save, to_process=%r", to_process)
        if got_nothing and recursive and refds_path:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict(
                'save',
                status='notneeded',
                path=refds_path,
                type='dataset',
                logger=lgr)
            return

        if not to_process:
            # nothing left to do, potentially all errored before
            return

        if super_datasets:
            # search for the topmost superdatasets of any path
            dss = [Dataset(ap.get('parentds', ap['path'])) for ap in to_process]
            superdss = [ds.get_superdataset(topmost=True)
                        for ds in dss]
            superdss = get_tree_roots(
                unique(ds.path for ds in dss + superdss if ds))
            if dataset:
                # need to adjust the reference to the new superds
                # if we had one ref before, we should still have exactly one
                assert len(superdss) <= 1
                dataset = list(superdss.keys())[0]
                refds_path = dataset
        elif refds_path:
            # there is a single superdataset
            superdss = {
                refds_path: unique([ap['parentds']
                                    for ap in to_process if 'parentds' in ap])}
        else:
            # sort all datasets under their potential superdatasets
            # start from the top to get all subdatasets down the line
            # and collate them into as few superdatasets as possible
            # this is quick, just string operations
            superdss = get_tree_roots(
                unique([ap['parentds'] for ap in to_process if 'parentds' in ap]))
        # for each "superdataset" check the tree of subdatasets and make sure
        # we gather all datasets between the super and any subdataset
        # so we can save them all bottom-up in order to be able to properly
        # save the superdataset
        # if this is called from e.g. `add` this is actually not necessary,
        # but in the general case we cannot avoid it
        # TODO maybe introduce a switch?
        discovered = {}
        for superds_path in superdss:
            target_subs = superdss[superds_path]
            discover_dataset_trace_to_targets(
                # from here
                superds_path,
                # to all
                target_subs,
                [],
                discovered)
        # create a new minimally annotated path for each discovered dataset
        discovered_added = set()
        for parentds in discovered:
            for subds in discovered[parentds]:
                to_process.append(dict(
                    path=subds,
                    parentds=parentds,
                    type='dataset'))
                discovered_added.add(subds)
        # make sure we have an entry for each dataset, including those
        # that are just parents
        for parentds in discovered:
            if parentds not in discovered_added:
                to_process.append(dict(
                    path=parentds,
                    type='dataset',
                    # make sure we save content of superds later on
                    process_content=True,
                    # but do not do nasty things, like adding untracked content
                    # just because we discovered this dataset
                    process_updated_only=True))

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, deduplication happens here too
        annotated_paths = AnnotatePaths.__call__(
            path=to_process,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='save',
            unavailable_path_status='',
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        # now sort into datasets so we can process them one by one
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert(not completed)

        # iterate over all datasets, starting at the bottom
        for dspath in sorted(content_by_ds.keys(), reverse=True):
            ds = Dataset(dspath)
            res = get_status_dict('save', ds=ds, logger=lgr)
            if not ds.is_installed():
                # TODO This is likely impossible now
                res['status'] = 'impossible'
                res['message'] = ('dataset %s is not installed', ds)
                yield res
                continue
            saved_state = save_dataset(
                ds,
                content_by_ds[dspath],
                message=message)
            res['status'] = 'ok' if saved_state else 'notneeded'
            # MIH: let's tag even if there was nothing to commit. I'd forget
            # this option too often...
            if version_tag:
                try:
                    # TODO: check whether comment below is still true after
                    # removing the log swallowing:
                    # again cannot help but force-silence low-level code, because
                    # it screams like a made man instead of allowing top-level
                    # code an orderly error report
                    ds.repo.tag(version_tag)
                    # even if we haven't saved anything
                    res['status'] = 'ok'
                    yield res
                except CommandError as e:
                    if saved_state:
                        # first we yield the result for the actual save
                        yield res
                    # and now complain that tagging didn't work
                    yield get_status_dict(
                        'save',
                        ds=ds,
                        logger=lgr,
                        status='error',
                        message=(
                            'cannot tag this version: %s',
                            e.stderr.strip()))
            else:
                yield res
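The superdataset handling above leans on get_tree_roots to collate a set of dataset paths under their topmost members, so saving can be organized per superdataset. A simplified, hypothetical version built on plain string operations follows (the real helper may differ in details):

def tree_roots(paths):
    # parents sort before their children, so a single sorted scan suffices
    roots = {}
    for p in sorted(paths):
        for r in roots:
            if p.startswith(r + '/'):
                roots[r].append(p)
                break
        else:
            roots[p] = []
    return roots

print(tree_roots(['/d1', '/d1/sub1', '/d1/sub1/sub1', '/d2/sub1']))
# {'/d1': ['/d1/sub1', '/d1/sub1/sub1'], '/d2/sub1': []}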