Example no. 1
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability
      checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts
    """
    # expensive, access only once
    ds_repo = ds.repo
    if 'action' not in kwargs:
        kwargs['action'] = 'drop'
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = ensure_list(paths)
    if not hasattr(ds_repo, 'drop'):
        for p in paths:
            r = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            r['action'] = 'drop'
            yield r
        return

    cmd = ['drop']
    if not check:
        cmd.append('--force')

    respath_by_status = {}
    try:
        yield from (_postproc_result(res, respath_by_status, ds)
                    for res in ds_repo._call_annex_records(cmd, files=paths))
    except CommandError as e:
        # pick up the results captured so far and yield them
        # the error will be amongst them
        yield from (_postproc_result(res, respath_by_status, ds)
                    for res in e.kwargs.get('stdout_json', []))
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds,
            paths,
            respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        r['action'] = 'drop'
        yield r
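
The try/except around the generator expression is the key pattern in this variant: even when the annex call dies, the records it already emitted (exposed on CommandError via kwargs['stdout_json']) are still post-processed and yielded, so callers see the failure in the normal result stream. Below is a self-contained sketch of that "drain partial results on failure" idea using toy stand-ins; ToyError, run_records, and postproc are invented for illustration and are not DataLad APIs.

class ToyError(Exception):
    def __init__(self, partial):
        super().__init__("command failed")
        # mimic CommandError.kwargs['stdout_json']: records seen before the failure
        self.kwargs = {'stdout_json': partial}


def run_records(fail_after=2):
    """Pretend to run an annex-like command that emits JSON records."""
    records = [{'file': 'a.dat', 'success': True},
               {'file': 'b.dat', 'success': False},
               {'file': 'c.dat', 'success': True}]
    if fail_after is not None:
        raise ToyError(records[:fail_after])
    return records


def postproc(rec):
    # stand-in for _postproc_result(): normalize to a result dict
    return {'action': 'drop', 'path': rec['file'],
            'status': 'ok' if rec['success'] else 'error'}


def drop_like(fail_after=2):
    try:
        yield from (postproc(r) for r in run_records(fail_after))
    except ToyError as e:
        # the command failed, but report whatever it managed to emit
        yield from (postproc(r) for r in e.kwargs.get('stdout_json', []))


print(list(drop_like()))      # partial results survive the failure
print(list(drop_like(None)))  # all results when nothing fails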
Example no. 2
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability
      checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts
    """
    if 'action' not in kwargs:
        kwargs['action'] = 'drop'
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = assure_list(paths)
    if not hasattr(ds.repo, 'drop'):
        for p in paths:
            r = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            r['action'] = 'drop'
            yield r
        return

    opts = ['--force'] if not check else []
    respath_by_status = {}
    for res in ds.repo.drop(paths, options=opts):
        res = annexjson2result(
            # annex reports are always about files
            res,
            ds,
            type='file',
            **kwargs)
        success = success_status_map[res['status']]
        respath_by_status[success] = \
            respath_by_status.get(success, []) + [res['path']]
        yield res
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds,
            paths,
            respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        r['action'] = 'drop'
        yield r
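
The explicit loop in this older variant makes the shared bookkeeping visible: each annex result is bucketed by whether its status maps to success, and requested paths that annex never mentioned are later reported as 'notneeded' via results_from_annex_noinfo. A simplified, self-contained sketch of that bookkeeping follows; bucket_by_success and noinfo_results are illustrative helpers, not DataLad functions, and the real results_from_annex_noinfo additionally handles directories and custom messages.

def bucket_by_success(results):
    # minimal stand-in for the success_status_map bookkeeping above
    success_status_map = {'ok': 'success', 'notneeded': 'success',
                          'error': 'failure', 'impossible': 'failure'}
    respath_by_status = {}
    for res in results:
        success = success_status_map[res['status']]
        respath_by_status[success] = \
            respath_by_status.get(success, []) + [res['path']]
    return respath_by_status


def noinfo_results(requested, respath_by_status, action='drop'):
    """Yield 'notneeded' records for requested paths annex said nothing about."""
    mentioned = {p for paths in respath_by_status.values() for p in paths}
    for p in requested:
        if p not in mentioned:
            yield {'action': action, 'path': p, 'status': 'notneeded',
                   'message': "no annex'ed content"}


annex_results = [{'path': '/ds/a.dat', 'status': 'ok'},
                 {'path': '/ds/b.dat', 'status': 'error'}]
by_status = bucket_by_success(annex_results)
print(by_status)
print(list(noinfo_results(['/ds/a.dat', '/ds/b.dat', '/ds/plain.txt'], by_status)))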
Example no. 3
def _get_targetpaths(ds, content, refds_path, source, jobs):
    # not ready for Path instances...
    content = [str(c) for c in content]
    # hand over to git-annex, get files content,
    # report files in git as 'notneeded' to get
    ds_repo = ds.repo
    # needs to be an annex to get content
    if not isinstance(ds_repo, AnnexRepo):
        for r in results_from_paths(
                content,
                status='notneeded',
                message="no dataset annex, content already present",
                action='get',
                type='file',
                logger=lgr,
                refds=refds_path):
            yield r
        return
    respath_by_status = {}
    try:
        results = ds_repo.get(content,
                              options=['--from=%s' % source] if source else [],
                              jobs=jobs)
    except CommandError as exc:
        results = exc.kwargs.get("stdout_json")
        if not results:
            raise

    for res in results:
        res = annexjson2result(res,
                               ds,
                               type='file',
                               logger=lgr,
                               refds=refds_path)
        success = success_status_map[res['status']]
        # TODO: in case of some failed commands (e.g. get) there might
        # be no path in the record.  yoh has only vague idea of logic
        # here so just checks for having 'path', but according to
        # results_from_annex_noinfo, then it would be assumed that
        # `content` was acquired successfully, which is not the case
        if 'path' in res:
            respath_by_status[success] = \
                respath_by_status.get(success, []) + [res['path']]
        yield res

    for r in results_from_annex_noinfo(
            ds,
            content,
            respath_by_status,
            dir_fail_msg='could not get some content in %s %s',
            noinfo_dir_msg='nothing to get from %s',
            noinfo_file_msg='already present',
            action='get',
            logger=lgr,
            refds=refds_path):
        yield r
Example no. 4
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability
      checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts
    """
    if 'action' not in kwargs:
        kwargs['action'] = 'drop'
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = assure_list(paths)
    if not hasattr(ds.repo, 'drop'):
        for p in paths:
            r = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            r['action'] = 'drop'
            yield r
        return

    opts = ['--force'] if not check else []
    respath_by_status = {}
    for res in ds.repo.drop(paths, options=opts):
        res = annexjson2result(
            # annex reports are always about files
            res, ds, type='file', **kwargs)
        success = success_status_map[res['status']]
        respath_by_status[success] = \
            respath_by_status.get(success, []) + [res['path']]
        yield res
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds, paths, respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        r['action'] = 'drop'
        yield r
Example no. 5
def _drop_files(ds, repo, paths, force=False, jobs=None):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
      dataset against which results are reported
    repo : AnnexRepo
      repository to drop content from
    paths : list
      for which files to drop content
    force : bool
      whether to instruct annex to skip minimum copy availability
      checks (adds --force to the drop call)
    jobs : int, optional
      how many parallel jobs to request from annex

    Yields
    ------
    dict
    """
    cmd = ['drop']
    if force:
        cmd.append('--force')
    if jobs:
        cmd.extend(['--jobs', str(jobs)])

    respath_by_status = {}
    try:
        yield from (_postproc_annexdrop_result(
            res, respath_by_status,
            ds) for res in repo._call_annex_records_items_(cmd, files=paths))
    except CommandError as e:
        # pick up the results captured so far and yield them
        # the error will be amongst them
        yield from (_postproc_annexdrop_result(res, respath_by_status, ds)
                    for res in e.kwargs.get('stdout_json', []))
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds,
            paths,
            respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content"):
        r['action'] = 'drop'
        yield r
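
Compared to the earlier variants, this one builds the annex command line directly, adding --force when availability checks are skipped and --jobs for parallelism. That assembly is easy to exercise in isolation; the tiny sketch below mirrors the lines above, with build_drop_cmd as a hypothetical helper name.

def build_drop_cmd(force=False, jobs=None):
    # mirrors the command assembly in the example above
    cmd = ['drop']
    if force:
        cmd.append('--force')
    if jobs:
        cmd.extend(['--jobs', str(jobs)])
    return cmd


print(build_drop_cmd())                    # ['drop']
print(build_drop_cmd(force=True, jobs=4))  # ['drop', '--force', '--jobs', '4']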
Example no. 6
File: add.py Project: hanke/datalad
    def __call__(
            path=None,
            dataset=None,
            # support passing this through in a path by path basis
            to_git=None,
            save=True,
            message=None,
            message_file=None,
            recursive=False,
            recursion_limit=None,
            ds2super=False,
            git_opts=None,
            annex_opts=None,
            annex_add_opts=None,
            jobs=None):
        # parameter constraints:
        if not path:
            raise InsufficientArgumentsError(
                "insufficient information for adding: requires at least a path")
        refds_path = Interface.get_refds_path(dataset)
        common_report = dict(action='add', logger=lgr, refds=refds_path)

        if message and message_file:
            raise ValueError("Both a message and message file were specified")

        if message_file:
            with open(message_file, "rb") as mfh:
                message = assure_unicode(mfh.read())

        to_add = []
        subds_to_add = {}
        ds_to_annotate_from_recursion = {}
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=dataset,
                # never recursion, need to handle manually below to be able to
                # discover untracked content
                recursive=False,
                action='add',
                # speed things up by using Git's modification detection, if there
                # is a repo with at least one commit
                modified='HEAD' \
                if dataset and \
                GitRepo.is_valid_repo(refds_path) and \
                GitRepo(refds_path).get_hexsha() \
                else None,
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            got_nothing = False
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset':
                yield get_status_dict(
                    status='impossible',
                    message='"there is no dataset to add this path to',
                    **dict(common_report, **ap))
                continue
            if ap.get('type', None) == 'directory' and \
                    ap.get('state', None) == 'untracked' and \
                    GitRepo.is_valid_repo(ap['path']):
                # this is an untracked wannabe subdataset in disguise
                ap['type'] = 'dataset'
            if recursive and \
                    (ap.get('raw_input', False) or
                     ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                    (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
                # this was an actually requested input path, or a path that was found
                # modified by path annotation, based on an input argument
                # we need to recurse into all subdirs to find potentially
                # unregistered subdatasets
                # but only if this path has a parent, or is itself a dataset
                # otherwise there is nothing to add to
                _discover_subdatasets_recursively(
                    ds_to_annotate_from_recursion,
                    ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']],
                    recursion_limit)
                # get the file content of the root dataset of this search added too
                # but be careful with extreme recursion_limit settings
                if recursion_limit is None or recursion_limit > 0:
                    ap['process_content'] = True
            # record for further processing
            if not ap['path'] in ds_to_annotate_from_recursion:
                # unless it was somehow already discovered
                to_add.append(ap)
        if got_nothing:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict(
                'add',
                status='notneeded',
                path=refds_path,
                type='dataset',
                logger=lgr)
            return

        for subds in ds_to_annotate_from_recursion:
            if subds not in subds_to_add:
                # always prefer the already annotated path
                subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

        if dataset:
            # we have a base dataset, discover any intermediate datasets between
            # the base and any already discovered dataset
            discovered = {}
            discover_dataset_trace_to_targets(
                # from here
                dataset.path,
                # to any dataset we are aware of
                subds_to_add.keys(),
                [],
                discovered)
            for parentds in discovered:
                for subds in discovered[parentds]:
                    subds_to_add[subds] = subds_to_add.get(
                        subds,
                        dict(path=subds, parentds=parentds, type='dataset'))

        # merge custom paths and discovered dataset records; paths need to go first,
        # because we know the most about them, and the subsequent annotation call
        # skips the later duplicates
        to_add.extend(subds_to_add.values())
        # and compact, this should be OK as all the info is in each ap dict
        to_add = unique(to_add, lambda x: x['path'])

        if not to_add:
            # nothing left to do, potentially all errored before
            return

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, it will also deduplicate
        annotated_paths = AnnotatePaths.__call__(
            path=to_add,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='add',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert(not completed)

        if not content_by_ds:
            # we should have complained about any inappropriate path argument
            # above, so if nothing is left, we can simply exit
            return

        # simple loop over datasets -- save happens later
        # start deep down
        to_save = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            torepoadd = {}
            respath_by_status = {}
            for ap in content_by_ds[ds_path]:
                # we have a new story
                ap.pop('status', None)
                torepoadd[ap['path']] = ap

                # skip anything that doesn't look like a wannabe subdataset
                if not ap.get('type', None) == 'dataset' or \
                        ap['path'] == ds_path:
                    continue

                if ap.get('registered_subds', False):
                    # subdataset that might be in this list because of the
                    # need to save all the way up to a super dataset
                    respath_by_status['success'] = \
                        respath_by_status.get('success', []) + [ap['path']]
                    yield get_status_dict(
                        status='notneeded',
                        message="already known subdataset",
                        **dict(common_report, **ap))
                    continue
                subds = Dataset(ap['path'])
                subds_relpath = relpath(ap['path'], ds_path)
                # Register the repository in the repo tree as a submodule
                try:
                    ds.repo.add_submodule(subds_relpath, url=None, name=None)
                except (CommandError, InvalidGitRepositoryError) as e:
                    yield get_status_dict(
                        ds=subds, status='error', message=e.stderr,
                        **dict(common_report, **ap))
                    continue
                # queue for saving using the updated annotated path
                ap['registered_subds'] = True
                # I hope this is true in direct mode too
                # TODO this is disabled, because in some circumstances
                # staging just doesn't happen, and it is unclear when
                # exactly -- the case that prompted disabling was a submodule
                # that had no content except for other submodules was not staged,
                # whereas another submodule on the same level in the same
                # superdataset which also has one file in it was staged
                # disabled to work correctly, at the cost of a little
                # slowdown
                #ap['staged'] = True
                to_save.append(ap)
                # report added subdatasets -- `annex add` below won't do it
                yield get_status_dict(
                    ds=subds,
                    status='ok',
                    message='added new subdataset',
                    **dict(common_report, **ap))
                # make sure that .gitmodules is added to the list of files
                gitmodules_path = opj(ds.path, '.gitmodules')
                # for git
                torepoadd[gitmodules_path] = dict(path=gitmodules_path)
                # and for save
                to_save.append(dict(
                    path=gitmodules_path,
                    parentds=ds_path,
                    type='file'))
            # make sure any last minute additions make it to the saving stage
            # XXX? should content_by_ds become OrderedDict so that possible
            # super here gets processed last?
            lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
            is_annex = isinstance(ds.repo, AnnexRepo)
            add_kw = {'jobs': jobs} if is_annex and jobs else {}
            added = ds.repo.add_(
                list(torepoadd.keys()),
                git=to_git if is_annex else True,
                **add_kw
            )
            for a in added:
                res = annexjson2result(a, ds, type='file', **common_report)
                success = success_status_map[res['status']]
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
                # produce best possible path/result annotation
                if res['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    res = dict(torepoadd[res['path']], **res)
                # override this in all cases to be safe
                res['parentds'] = ds.path
                if success:
                    # this was successfully added, queue for saving this very path
                    # in the dataset
                    ap = {k: v for k, v in res.items() if k != 'status'}
                    ap['staged'] = True
                    # strip any status and state info (e.g. save will refuse to save
                    # stuff that is marked state='untracked')
                    to_save.append({k: v for k, v in res.items()
                                    if k not in ('status', 'state')})
                if a['file'] == '.gitmodules':
                    # filter out .gitmodules, because this is only included for
                    # technical reasons and has nothing to do with the actual content
                    continue
                if GitRepo.is_valid_repo(res['path']):
                    # more accurate report in case of an added submodule
                    # mountpoint.
                    # XXX Actually not sure if this can really happen
                    # (depends on what our low-level code would do)
                    # but worst case is that we lose a little bit of
                    # coverage...
                    res['type'] = 'dataset'
                    res['message'] = 'added new state as submodule'
                yield res

            for r in results_from_annex_noinfo(
                    ds, torepoadd, respath_by_status,
                    dir_fail_msg='could not add some content in %s %s',
                    noinfo_dir_msg='nothing to add from %s',
                    noinfo_file_msg='already included in the dataset',
                    action='add',
                    logger=lgr,
                    refds=refds_path):
                if r['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    r = dict(r, **torepoadd[r['path']])

                if r['status'] == 'notneeded':
                    # this could be a file that was staged already, it doesn't need
                    # to be added, but it should be saved/committed if so desired
                    to_save.append({k: v for k, v in r.items()
                                    if k not in ('status', 'state')})

                # XXX something is fishy with the next one, rethink when sober....
                if r['path'] == ds_path and r['status'] == 'ok':
                    # this is for the entire dataset itself which was explicitly requested
                    # make sure to save all
                    r['type'] = 'dataset'
                    r['process_content'] = True
                    to_save.append({k: v for k, v in r.items() if k != 'status'})
                yield r
            if refds_path and ds_path != refds_path and len(respath_by_status.get('success', [])):
                # TODO XXX we have an issue here with `add('.')` when annex ignores any
                # dotfiles. In this case we end up not saving a dataset completely, because
                # we rely on accurate reporting. there is an issue about this already
                # TODO look up the issue ID
                # if there is a base dataset, but we are below it, and we have anything done to this
                # dataset -> queue dataset itself for saving its state in the parent
                ds_ap = dict(
                    path=ds.path,
                    # we have to look for the parent here, as we must save the
                    # subdataset in the parent and not the whole subdataset itself
                    type='dataset')
                parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
                if parentds:
                    ds_ap['parentds'] = parentds
                if dataset:
                    ds_ap['refds'] = refds_path
                to_save.append(ds_ap)

        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        # TODO tell save what was staged already! Set 'staged=True' for
        # respective annotated paths that are fed into `save`

        # do not reuse any of the sorting done in here for saving, but instead
        # pass on all the annotated paths to have `save` figure out what to do with
        # them -- this costs something, but should be safer, and frankly is
        # more comprehensible
        for res in Save.__call__(
                # hand-selected annotated paths
                path=to_save,
                dataset=refds_path,
                message=message if message else '[DATALAD] added content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
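
A subtle detail in the result handling above is the direction of the dict merges: dict(torepoadd[res['path']], **res) lets the fresh annex report override the stored annotation, while dict(r, **torepoadd[r['path']]) lets the stored annotation override the synthetic noinfo record. A toy illustration with invented values:

# Toy illustration of the two merge directions used in the example above.
annotated = {'path': '/ds/file.dat', 'type': 'file', 'state': 'untracked'}
annex_res = {'path': '/ds/file.dat', 'status': 'ok', 'type': 'file'}

# dict(torepoadd[path], **res): the annex report wins on shared keys
print(dict(annotated, **annex_res))
# dict(r, **torepoadd[path]): the stored annotation wins on shared keys
print(dict(annex_res, **annotated))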
Example no. 7
File: add.py Project: nellh/datalad
    def __call__(
            path=None,
            dataset=None,
            # support passing this through in a path by path basis
            to_git=None,
            save=True,
            message=None,
            recursive=False,
            recursion_limit=None,
            ds2super=False,
            git_opts=None,
            annex_opts=None,
            annex_add_opts=None,
            jobs=None):
        # parameter constraints:
        if not path:
            raise InsufficientArgumentsError(
                "insufficient information for adding: requires at least a path"
            )
        refds_path = Interface.get_refds_path(dataset)
        common_report = dict(action='add', logger=lgr, refds=refds_path)

        to_add = []
        subds_to_add = {}
        ds_to_annotate_from_recursion = {}
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=dataset,
                # never recursion, need to handle manually below to be able to
                # discover untracked content
                recursive=False,
                action='add',
                # speed things up by using Git's modification detection, if there
                # is a repo with at least one commit
                modified='HEAD' \
                if dataset and \
                GitRepo.is_valid_repo(refds_path) and \
                GitRepo(refds_path).get_hexsha() \
                else None,
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            got_nothing = False
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('parentds', None) is None and ap.get('type',
                                                           None) != 'dataset':
                yield get_status_dict(
                    status='impossible',
                    message='"there is no dataset to add this path to',
                    **dict(common_report, **ap))
                continue
            if ap.get('type', None) == 'directory' and \
                    ap.get('state', None) == 'untracked' and \
                    GitRepo.is_valid_repo(ap['path']):
                # this is an untracked wannabe subdataset in disguise
                ap['type'] = 'dataset'
            if recursive and \
                    (ap.get('raw_input', False) or
                     ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                    (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
                # this was an actually requested input path, or a path that was found
                # modified by path annotation, based on an input argument
                # we need to recurse into all subdirs to find potentially
                # unregistered subdatasets
                # but only if this path has a parent, or is itself a dataset
                # otherwise there is nothing to add to
                _discover_subdatasets_recursively(
                    ds_to_annotate_from_recursion, ap['path'],
                    [ap['parentds'] if 'parentds' in ap else ap['path']],
                    recursion_limit)
                # get the file content of the root dataset of this search added too
                # but be careful with extreme recursion_limit settings
                if recursion_limit is None or recursion_limit > 0:
                    ap['process_content'] = True
            # record for further processing
            if not ap['path'] in ds_to_annotate_from_recursion:
                # unless it was somehow already discovered
                to_add.append(ap)
            # TODO check if next isn't covered by discover_dataset_trace_to_targets already??
            if dataset and ap.get('type', None) == 'dataset':
                # duplicates not possible, annotated_paths returns unique paths
                subds_to_add[ap['path']] = ap
        if got_nothing:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict('add',
                                  status='notneeded',
                                  path=refds_path,
                                  type='dataset',
                                  logger=lgr)
            return

        for subds in ds_to_annotate_from_recursion:
            if subds not in subds_to_add:
                # always prefer the already annotated path
                subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

        if dataset:
            # we have a base dataset, discover any intermediate datasets between
            # the base and any already discovered dataset
            discovered = {}
            discover_dataset_trace_to_targets(
                # from here
                dataset.path,
                # to any dataset we are aware of
                subds_to_add.keys(),
                [],
                discovered)
            for parentds in discovered:
                for subds in discovered[parentds]:
                    subds_to_add[subds] = subds_to_add.get(
                        subds,
                        dict(path=subds, parentds=parentds, type='dataset'))

        # merge custom paths and discovered dataset records; paths need to go first,
        # because we know the most about them, and the subsequent annotation call
        # skips the later duplicates
        to_add.extend(subds_to_add.values())
        # and compact, this should be OK as all the info is in each ap dict
        to_add = unique(to_add, lambda x: x['path'])

        if not to_add:
            # nothing left to do, potentially all errored before
            return

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, it will also deduplicate
        annotated_paths = AnnotatePaths.__call__(
            path=to_add,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='add',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert (not completed)

        if not content_by_ds:
            # we should have complained about any inappropriate path argument
            # above, so if nothing is left, we can simply exit
            return

        # simple loop over datasets -- save happens later
        # start deep down
        to_save = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            torepoadd = {}
            respath_by_status = {}
            for ap in content_by_ds[ds_path]:
                # we have a new story
                ap.pop('status', None)
                torepoadd[ap['path']] = ap

                # skip anything that doesn't look like a wannabe subdataset
                if not ap.get('type', None) == 'dataset' or \
                        ap['path'] == ds_path:
                    continue

                if ap.get('registered_subds', False):
                    # subdataset that might be in this list because of the
                    # need to save all the way up to a super dataset
                    respath_by_status['success'] = \
                        respath_by_status.get('success', []) + [ap['path']]
                    yield get_status_dict(status='notneeded',
                                          message="already known subdataset",
                                          **dict(common_report, **ap))
                    continue
                subds = Dataset(ap['path'])
                # check that the subds has a commit, and refuse
                # to operate on it otherwise, or we would get a bastard
                # submodule that cripples git operations
                if not subds.repo.get_hexsha():
                    yield get_status_dict(
                        ds=subds,
                        status='impossible',
                        message='cannot add subdataset with no commits',
                        **dict(common_report, **ap))
                    continue
                subds_relpath = relpath(ap['path'], ds_path)
                # make an attempt to configure a submodule source URL based on the
                # discovered remote configuration
                remote, branch = subds.repo.get_tracking_branch()
                subds_url = subds.repo.get_remote_url(
                    remote) if remote else None
                # Register the repository in the repo tree as a submodule
                try:
                    ds.repo.add_submodule(subds_relpath,
                                          url=subds_url,
                                          name=None)
                except CommandError as e:
                    yield get_status_dict(ds=subds,
                                          status='error',
                                          message=e.stderr,
                                          **dict(common_report, **ap))
                    continue
                # queue for saving using the updated annotated path
                ap['registered_subds'] = True
                # I hope this is true in direct mode too
                # TODO this is disabled, because in some circumstances
                # staging just doesn't happen, and it is unclear when
                # exactly -- the case that prompted disabling was a submodule
                # that had no content except for other submodules was not staged,
                # whereas another submodule on the same level in the same
                # superdataset which also has one file in it was staged
                # disabled to work correctly, at the cost of a little
                # slowdown
                #ap['staged'] = True
                to_save.append(ap)
                _fixup_submodule_dotgit_setup(ds, subds_relpath)
                # report added subdatasets -- `annex add` below won't do it
                yield get_status_dict(ds=subds,
                                      status='ok',
                                      message='added new subdataset',
                                      **dict(common_report, **ap))
                # make sure that .gitmodules is added to the list of files
                gitmodules_path = opj(ds.path, '.gitmodules')
                # for git
                torepoadd[gitmodules_path] = dict(path=gitmodules_path)
                # and for save
                to_save.append(
                    dict(path=gitmodules_path, parentds=ds_path, type='file'))
            # make sure any last minute additions make it to the saving stage
            # XXX? should content_by_ds become OrderedDict so that possible
            # super here gets processed last?
            lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
            is_annex = isinstance(ds.repo, AnnexRepo)
            add_kw = {'jobs': jobs} if is_annex and jobs else {}
            added = ds.repo.add(list(torepoadd.keys()),
                                git=to_git if is_annex else True,
                                **add_kw)
            for a in added:
                res = annexjson2result(a, ds, type='file', **common_report)
                success = success_status_map[res['status']]
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
                # produce best possible path/result annotation
                if res['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    res = dict(torepoadd[res['path']], **res)
                # override this in all cases to be safe
                res['parentds'] = ds.path
                if success:
                    # this was successfully added, queue for saving this very path
                    # in the dataset
                    ap = {k: v for k, v in res.items() if k != 'status'}
                    ap['staged'] = True
                    # strip any status and state info (e.g. save will refuse to save
                    # stuff that is marked state='untracked')
                    to_save.append({
                        k: v
                        for k, v in res.items() if k not in ('status', 'state')
                    })
                if a['file'] == '.gitmodules':
                    # filter out .gitmodules, because this is only included for
                    # technical reasons and has nothing to do with the actual content
                    continue
                if GitRepo.is_valid_repo(res['path']):
                    # more accurate report in case of an added submodule
                    # mountpoint.
                    # XXX Actually not sure if this can really happen
                    # (depends on what our low-level code would do)
                    # but worst case is that we lose a little bit of
                    # coverage...
                    res['type'] = 'dataset'
                    res['message'] = 'added new state as submodule'
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    torepoadd,
                    respath_by_status,
                    dir_fail_msg='could not add some content in %s %s',
                    noinfo_dir_msg='nothing to add from %s',
                    noinfo_file_msg='already included in the dataset',
                    action='add',
                    logger=lgr,
                    refds=refds_path):
                if r['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    r = dict(r, **torepoadd[r['path']])

                if r['status'] == 'notneeded':
                    # this could be a file that was staged already, it doesn't need
                    # to be added, but it should be saved/committed if so desired
                    to_save.append({
                        k: v
                        for k, v in r.items() if k not in ('status', 'state')
                    })

                # XXX something is fishy with the next one, rethink when sober....
                if r['path'] == ds_path and r['status'] == 'ok':
                    # this is for the entire dataset itself which was explicitly requested
                    # make sure to save all
                    r['type'] = 'dataset'
                    r['process_content'] = True
                    to_save.append(
                        {k: v
                         for k, v in r.items() if k != 'status'})
                yield r
            if refds_path and ds_path != refds_path and len(
                    respath_by_status.get('success', [])):
                # TODO XXX we have an issue here with `add('.')` when annex ignores any
                # dotfiles. In this case we end up not saving a dataset completely, because
                # we rely on accurate reporting. there is an issue about this already
                # TODO look up the issue ID
                # if there is a base dataset, but we are below it, and we have anything done to this
                # dataset -> queue dataset itself for saving its state in the parent
                ds_ap = dict(
                    path=ds.path,
                    # we have to look for the parent here, as we must save the
                    # subdataset in the parent and not the whole subdataset itself
                    type='dataset')
                parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
                if parentds:
                    ds_ap['parentds'] = parentds
                if dataset:
                    ds_ap['refds'] = refds_path
                to_save.append(ds_ap)

        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        # TODO tell save what was staged already! Set 'staged=True' for
        # respective annotated paths that are fed into `save`

        # do not reuse any of the sorting done in here for saving, but instead
        # pass on all the annotated paths to have `save` figure out what to do with
        # them -- this costs something, but should be safer, and frankly is
        # more comprehensible
        for res in Save.__call__(
                # hand-selected annotated paths
                path=to_save,
                dataset=refds_path,
                message=message if message else '[DATALAD] added content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
Example no. 8
    def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        description=None,
        reckless=False,
        #git_opts=None,
        #annex_opts=None,
        #annex_get_opts=None,
        jobs='auto',
        verbose=False,
    ):
        # IMPLEMENTATION CONCEPT:
        #
        # 1. Sort the world into existing handles and the rest
        # 2. Try to locate missing handles (obtain subdatasets along the way)
        # 3. Expand into subdatasets with recursion enabled (potentially
        #    obtain even more subdatasets)
        # 4. Shoot info of which handles to get in each subdataset to
        #    git-annex, once at the very end

        refds_path = Interface.get_refds_path(dataset)
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        # remember which results we already reported, to avoid duplicates
        yielded_ds = []
        to_get = []
        unavailable_paths = []
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='get',
                # NOTE: Do not act upon unavailable paths yet! Done below after
                # testing which ones could be obtained
                unavailable_path_status='',
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('state', None) == 'absent' and ap.get(
                    'raw_input', False):
                # if this wasn't found, but directly requested, queue for further
                # exploration
                unavailable_paths.append(ap)
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                # do not report what hasn't arrived yet
                # also do not report the base dataset that is already
                # present -- no surprise
                yield dict(ap,
                           status='notneeded',
                           logger=lgr,
                           message='already installed')
                yielded_ds.append(ap['path'])
                ap['process_content'] = get_data
            to_get.append(ap)

        # explore the unknown
        for ap in sorted(unavailable_paths, key=lambda x: x['path']):
            lgr.debug("Investigate yet unavailable path %s", ap)
            # how close can we get?
            dspath = ap.get('parentds', get_dataset_root(ap['path']))
            if dspath is None:
                # nothing we can do for this path
                continue
            lgr.debug("Found containing dataset %s for path %s", dspath,
                      ap['path'])
            ds = Dataset(dspath)
            # now actually obtain whatever is necessary to get to this path
            containing_ds = [dspath]
            for res in _install_necessary_subdatasets(ds,
                                                      ap['path'],
                                                      reckless,
                                                      refds_path,
                                                      description=description):
                # yield immediately so errors could be acted upon outside, before
                # we continue
                if not (res['type'] == 'dataset'
                        and res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record, recursive below might now want to report
                        # a 'notneeded'
                        yielded_ds.append(res['path'])
                    yield res
                # update to the current innermost dataset
                containing_ds.append(res['path'])

            if len(containing_ds) < 2:
                # no subdataset was installed, hence if the path was unavailable
                # before it still is, no need to bother git annex
                ap.update(status='impossible', message='path does not exist')
                yield ap
                continue
            # important to only do the next for the innermost subdataset
            # as the `recursive` logic below relies on that!
            # set the correct parent, for a dataset this would be the second-last
            # reported subdataset
            ap.update(parentds=containing_ds[-1])
            if containing_ds[-1] == ap['path']:
                # the path actually refers to the last installed dataset
                ap.update(parentds=containing_ds[-2],
                          process_content=get_data,
                          type='dataset')
            to_get.append(ap)

        # results of recursive installation of yet undiscovered datasets
        rec_get = []
        if recursive and not recursion_limit == 'existing':
            # obtain any subdatasets underneath the paths given inside the
            # subdatasets that we know already exist
            # unless we do not want recursion into not-yet-installed datasets
            for ap in sorted(to_get, key=lambda x: x['path']):
                if ap['type'] not in ('dataset', 'directory') or not ap.get(
                        'raw_input', False):
                    # a non-directory cannot have content underneath
                    # also we do NOT want to recurse into anything that was specifically
                    # requested, to avoid duplication
                    continue
                subds = Dataset(ap['path'] if ap['type'] ==
                                'dataset' else ap['parentds'])
                lgr.info("Installing %s%s recursively", subds,
                         (" underneath %s" %
                          ap['path'] if subds.path != ap['path'] else ""))
                for res in _recursive_install_subds_underneath(
                        subds,
                        # `ap['path']` was explicitly given as input
                        # we count recursions from the input, hence we
                        # can start with the full number
                        recursion_limit,
                        reckless,
                        start=ap['path'],
                        refds_path=refds_path,
                        description=description):
                    # yield immediately so errors could be acted upon
                    # outside, before we continue
                    if not (res['type'] == 'dataset'
                            and res['path'] in yielded_ds):
                        # unless we reported on this dataset before
                        if res['type'] == 'dataset':
                            # make a record
                            yielded_ds.append(res['path'])
                    yield res
                    if not (res['status'] == 'ok'
                            and res['type'] == 'dataset'):
                        # not a dataset that was just installed, we just reported it
                        # upstairs, and can ignore it from now on
                        continue
                    # paranoia, so popular these days...
                    assert GitRepo.is_valid_repo(res['path'])
                    # keep a copy of the install record for `get` later on
                    get_ap = {
                        k: v
                        for k, v in res.items() if not k == 'status'
                    }
                    get_ap['process_content'] = get_data
                    rec_get.append(get_ap)

        if not get_data:
            # done already
            return

        # merge the two AP lists
        to_get.extend(rec_get)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_get,
                refds_path=refds_path)
        assert (not completed)

        # hand over to git-annex, get files content,
        # report files in git as 'notneeded' to get
        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            # grab content, ignore subdataset entries
            content = [
                ap['path'] for ap in content_by_ds[ds_path]
                if ap.get('type', None) != 'dataset' or ap['path'] == ds.path
            ]
            if not content:
                # cut this short should there be nothing
                continue
            # needs to be an annex to get content
            if not isinstance(ds.repo, AnnexRepo):
                for r in results_from_paths(
                        content,
                        status='notneeded',
                        message="no dataset annex, content already present",
                        action='get',
                        logger=lgr,
                        refds=refds_path):
                    yield r
                continue
            respath_by_status = {}
            for res in ds.repo.get(content,
                                   options=['--from=%s' %
                                            source] if source else [],
                                   jobs=jobs):
                res = annexjson2result(res,
                                       ds,
                                       type='file',
                                       logger=lgr,
                                       refds=refds_path)
                success = success_status_map[res['status']]
                # TODO: in case of some failed commands (e.g. get) there might
                # be no path in the record.  yoh has only vague idea of logic
                # here so just checks for having 'path', but according to
                # results_from_annex_noinfo, then it would be assumed that
                # `content` was acquired successfully, which is not the case
                if 'path' in res:
                    respath_by_status[success] = \
                        respath_by_status.get(success, []) + [res['path']]
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    content,
                    respath_by_status,
                    dir_fail_msg='could not get some content in %s %s',
                    noinfo_dir_msg='nothing to get from %s',
                    noinfo_file_msg='already present',
                    action='get',
                    logger=lgr,
                    refds=refds_path):
                yield r
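
Step 4 of the implementation concept depends on first sorting every annotated path into its containing dataset, which is what annotated2content_by_ds does before the per-dataset get loop. A simplified, self-contained sketch of that grouping step follows; group_by_dataset is an illustrative helper, and the real function additionally returns dataset properties, completed results, and non-dataset paths.

def group_by_dataset(annotated_paths):
    """Group annotated paths by the dataset that contains them (simplified)."""
    content_by_ds = {}
    for ap in annotated_paths:
        # a dataset path is its own container, a file belongs to its parentds
        ds_path = ap['path'] if ap.get('type') == 'dataset' else ap.get('parentds')
        if ds_path is None:
            continue  # nothing we can do for paths outside any dataset
        content_by_ds.setdefault(ds_path, []).append(ap)
    return content_by_ds


aps = [{'path': '/super/sub', 'type': 'dataset', 'parentds': '/super'},
       {'path': '/super/sub/data.dat', 'type': 'file', 'parentds': '/super/sub'},
       {'path': '/super/notes.txt', 'type': 'file', 'parentds': '/super'}]
content_by_ds = group_by_dataset(aps)
for ds_path in sorted(content_by_ds):
    print(ds_path, [ap['path'] for ap in content_by_ds[ds_path]])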
Example no. 9
File: get.py Project: hanke/datalad
    def __call__(
            path=None,
            source=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            get_data=True,
            description=None,
            reckless=False,
            #git_opts=None,
            #annex_opts=None,
            #annex_get_opts=None,
            jobs='auto',
            verbose=False,
    ):
        # IMPLEMENTATION CONCEPT:
        #
        # 1. Sort the world into existing handles and the rest
        # 2. Try to locate missing handles (obtain subdatasets along the way)
        # 3. Expand into subdatasets with recursion enabled (potentially
        #    obtain even more subdatasets)
        # 4. Shoot info of which handles to get in each subdataset to
        #    git-annex, once at the very end

        refds_path = Interface.get_refds_path(dataset)
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        # remember which results we already reported, to avoid duplicates
        yielded_ds = []
        to_get = []
        unavailable_paths = []
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='get',
                # NOTE: Do not act upon unavailable paths yet! Done below after
                # testing which ones could be obtained
                unavailable_path_status='',
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('state', None) == 'absent' and ap.get('raw_input', False):
                # if this wasn't found, but directly requested, queue for further
                # exploration
                unavailable_paths.append(ap)
                continue
            if ap.get('type', None) == 'dataset' and \
                    GitRepo.is_valid_repo(ap['path']) and \
                    not ap['path'] == refds_path:
                # do not report what hasn't arrived yet
                # also do not report the base dataset that is already
                # present -- no surprise
                yield dict(ap, status='notneeded', logger=lgr,
                           message='already installed')
                yielded_ds.append(ap['path'])
                ap['process_content'] = get_data
            to_get.append(ap)

        # explore the unknown
        for ap in sorted(unavailable_paths, key=lambda x: x['path']):
            lgr.debug("Investigate yet unavailable path %s", ap)
            # how close can we get?
            dspath = ap.get('parentds', get_dataset_root(ap['path']))
            if dspath is None:
                # nothing we can do for this path
                continue
            lgr.debug("Found containing dataset %s for path %s", dspath, ap['path'])
            ds = Dataset(dspath)
            # now actually obtain whatever is necessary to get to this path
            containing_ds = [dspath]
            for res in _install_necessary_subdatasets(
                    ds, ap['path'], reckless, refds_path, description=description):
                # yield immediately so errors could be acted upon outside, before
                # we continue
                if not (res['type'] == 'dataset' and res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record, recursive below might now want to report
                        # a 'notneeded'
                        yielded_ds.append(res['path'])
                    yield res
                # update to the current innermost dataset
                containing_ds.append(res['path'])

            if len(containing_ds) < 2:
                # no subdataset was installed, hence if the path was unavailable
                # before it still is, no need to bother git annex
                ap.update(status='impossible',
                          message='path does not exist')
                yield ap
                continue
            # important to only do the next for the innermost subdataset
            # as the `recursive` logic below relies on that!
            # set the correct parent, for a dataset this would be the second-last
            # reported subdataset
            ap.update(parentds=containing_ds[-1])
            if containing_ds[-1] == ap['path']:
                # the path actually refers to the last installed dataset
                ap.update(parentds=containing_ds[-2],
                          process_content=get_data,
                          type='dataset')
            to_get.append(ap)

        # results of recursive installation of yet undiscovered datasets
        rec_get = []
        if recursive and not recursion_limit == 'existing':
            # obtain any subdatasets underneath the paths given inside the
            # subdatasets that we know already exist
            # unless we do not want recursion into not-yet-installed datasets
            for ap in sorted(to_get, key=lambda x: x['path']):
                if ap.get('type', None) not in ('dataset', 'directory') \
                        or not ap.get('raw_input', False):
                    # a non-directory cannot have content underneath;
                    # also only recurse from explicitly requested paths,
                    # to avoid duplicate processing
                    continue
                subds = Dataset(ap['path'] if ap['type'] == 'dataset' else ap['parentds'])
                lgr.info(
                    "Installing %s%s recursively",
                    subds,
                    (" underneath %s" % ap['path']
                     if subds.path != ap['path']
                     else ""))
                for res in _recursive_install_subds_underneath(
                        subds,
                        # `ap['path']` was explicitly given as input
                        # we count recursions from the input, hence we
                        # can start with the full number
                        recursion_limit,
                        reckless,
                        start=ap['path'],
                        refds_path=refds_path,
                        description=description):
                    # yield immediately so errors could be acted upon
                    # outside, before we continue
                    if not (res['type'] == 'dataset' and res['path'] in yielded_ds):
                        # unless we reported on this dataset before
                        if res['type'] == 'dataset':
                            # make a record
                            yielded_ds.append(res['path'])
                        yield res
                    if not (res['status'] == 'ok' and res['type'] == 'dataset'):
                        # not a dataset that was just installed, we just reported it
                        # upstairs, and can ignore it from now on
                        continue
                    # paranoia, so popular these days...
                    assert GitRepo.is_valid_repo(res['path'])
                    # keep a copy of the install record for `get` later on
                    get_ap = {k: v for k, v in res.items()
                              if not k == 'status'}
                    get_ap['process_content'] = get_data
                    rec_get.append(get_ap)

        if not get_data:
            # done already
            return

        # merge the two AP lists
        to_get.extend(rec_get)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_get,
                refds_path=refds_path)
        assert(not completed)
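        # content_by_ds maps the path of each containing dataset to the
        # annotated paths that fall inside it (consumed in the loop below);
        # ds_props and nondataset_paths are not used in the remainder of
        # this listing.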

        # hand over to git-annex, get files content,
        # report files in git as 'notneeded' to get
        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            # grab content, ignore subdataset entries
            content = [ap['path'] for ap in content_by_ds[ds_path]
                       if ap.get('type', None) != 'dataset' or ap['path'] == ds.path]
            if not content:
                # cut this short should there be nothing
                continue
            # needs to be an annex to get content
            if not isinstance(ds.repo, AnnexRepo):
                for r in results_from_paths(
                        content, status='notneeded',
                        message="no dataset annex, content already present",
                        action='get', logger=lgr,
                        refds=refds_path):
                    yield r
                continue
            respath_by_status = {}
            for res in ds.repo.get(
                    content,
                    options=['--from=%s' % source] if source else [],
                    jobs=jobs):
                res = annexjson2result(res, ds, type='file', logger=lgr,
                                       refds=refds_path)
                success = success_status_map[res['status']]
                # TODO: some failed commands (e.g. get) may produce a record
                # without a 'path'.  The logic here is not fully understood
                # (yoh), so we just check for the presence of 'path'; per
                # results_from_annex_noinfo, a missing record would otherwise
                # be taken to mean that `content` was acquired successfully,
                # which is not the case
                if 'path' in res:
                    respath_by_status[success] = \
                        respath_by_status.get(success, []) + [res['path']]
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    content,
                    respath_by_status,
                    dir_fail_msg='could not get some content in %s %s',
                    noinfo_dir_msg='nothing to get from %s',
                    noinfo_file_msg='already present',
                    action='get',
                    logger=lgr,
                    refds=refds_path):
                yield r
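
For context, the __call__ above implements the get command's main entry point. A hedged usage sketch, assuming the standard datalad.api wrapper and placeholder paths (the keyword names mirror the signature shown above; exact return behaviour depends on the installed DataLad version):

# Hypothetical invocation; '/tmp/myds' and 'data/raw' are placeholders.
import datalad.api as dl

results = dl.get(
    path='data/raw',      # content to obtain within the dataset
    dataset='/tmp/myds',  # reference dataset to operate on
    recursive=True,       # also handle subdatasets underneath the path
    get_data=True,        # fetch file content, not just install subdatasets
    jobs='auto',          # let git-annex choose the number of transfer jobs
)
for r in results:         # result records as produced by the generator above
    print(r['status'], r.get('path', ''))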