Example #1
def handle_dirty_dataset(ds, mode, msg=None):
    """Detect and treat unsaved changes as instructed by `mode`

    Parameters
    ----------
    ds : Dataset or None
      Dataset to be inspected. Does nothing if `None`.
    mode : {'fail', 'ignore', 'save-before'}
      How to act upon discovering unsaved changes.
    msg : str or None
      Custom message to use for a potential commit.

    Returns
    -------
    None
    """
    if ds is None:
        # nothing to be handled
        return
    if msg is None:
        msg = '[DATALAD] auto-saved changes'
    if mode == 'ignore':
        return
    elif mode == 'fail':
        if not ds.repo or ds.repo.repo.is_dirty(index=True,
                                                working_tree=True,
                                                untracked_files=True,
                                                submodules=True):
            raise RuntimeError('dataset {} has unsaved changes'.format(ds))
    elif mode == 'save-before':
        if not ds.is_installed():
            raise RuntimeError('dataset {} is not yet installed'.format(ds))
        from datalad.interface.save import Save
        Save.__call__(dataset=ds, message=msg, auto_add_changes=True)
    else:
        raise ValueError("unknown if-dirty mode '{}'".format(mode))
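
A usage sketch (hypothetical: the Dataset import path and the '/tmp/some/ds' path are illustrative assumptions, not taken from the examples above):

from datalad.distribution.dataset import Dataset

ds = Dataset('/tmp/some/ds')               # hypothetical dataset location
handle_dirty_dataset(ds, 'fail')           # raises RuntimeError if dirty
handle_dirty_dataset(ds, 'ignore')         # proceeds regardless of changes
handle_dirty_dataset(ds, 'save-before',    # commits pending changes first
                     msg='[DATALAD] pre-operation save')
handle_dirty_dataset(None, 'fail')         # no-op when no dataset is given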
Example #2
def handle_dirty_dataset(ds, mode, msg=None):
    """Detect and treat unsaved changes as instructed by `mode`

    Parameters
    ----------
    ds : Dataset or None
      Dataset to be inspected. Does nothing if `None`.
    mode : {'fail', 'ignore', 'save-before'}
      How to act upon discovering unsaved changes.
    msg : str or None
      Custom message to use for a potential commit.

    Returns
    -------
    None
    """
    if ds is None:
        # nothing to be handled
        return
    if msg is None:
        msg = '[DATALAD] auto-saved changes'

    # make sure that all pending changes (batched annex operations, etc.)
    # are actually reflected in Git
    if ds.repo:
        ds.repo.precommit()

    if mode == 'ignore':
        return
    elif mode == 'fail':
        if not ds.repo or ds.repo.is_dirty(index=True,
                                           untracked_files=True,
                                           submodules=True):
            raise RuntimeError('dataset {} has unsaved changes'.format(ds))
    elif mode == 'save-before':
        if not ds.is_installed():
            raise RuntimeError('dataset {} is not yet installed'.format(ds))
        from datalad.interface.save import Save
        Save.__call__(dataset=ds, message=msg)
    else:
        raise ValueError("unknown if-dirty mode '{}'".format(mode))
Example #3
File: add.py Project: hanke/datalad
    def __call__(
            path=None,
            dataset=None,
            # support passing this through in a path by path basis
            to_git=None,
            save=True,
            message=None,
            message_file=None,
            recursive=False,
            recursion_limit=None,
            ds2super=False,
            git_opts=None,
            annex_opts=None,
            annex_add_opts=None,
            jobs=None):
        # parameter constraints:
        if not path:
            raise InsufficientArgumentsError(
                "insufficient information for adding: requires at least a path")
        refds_path = Interface.get_refds_path(dataset)
        common_report = dict(action='add', logger=lgr, refds=refds_path)

        if message and message_file:
            raise ValueError("Both a message and message file were specified")

        if message_file:
            with open(message_file, "rb") as mfh:
                message = assure_unicode(mfh.read())

        to_add = []
        subds_to_add = {}
        ds_to_annotate_from_recursion = {}
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=dataset,
                # never recursion, need to handle manually below to be able to
                # discover untracked content
                recursive=False,
                action='add',
                # speed things up by using Git's modification detection, if there
                # is a repo with at least one commit
                modified='HEAD' \
                if dataset and \
                GitRepo.is_valid_repo(refds_path) and \
                GitRepo(refds_path).get_hexsha() \
                else None,
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            got_nothing = False
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset':
                yield get_status_dict(
                    status='impossible',
                    message='"there is no dataset to add this path to',
                    **dict(common_report, **ap))
                continue
            if ap.get('type', None) == 'directory' and \
                    ap.get('state', None) == 'untracked' and \
                    GitRepo.is_valid_repo(ap['path']):
                # this is an untracked wannabe subdataset in disguise
                ap['type'] = 'dataset'
            if recursive and \
                    (ap.get('raw_input', False) or
                     ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                    (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
                # this was an actually requested input path, or a path that was found
                # modified by path annotation, based on an input argument
                # we need to recurse into all subdirs to find potentially
                # unregistered subdatasets
                # but only if this path has a parent, or is itself a dataset
                # otherwise there is nothing to add to
                _discover_subdatasets_recursively(
                    ds_to_annotate_from_recursion,
                    ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']],
                    recursion_limit)
                # get the file content of the root dataset of this search added too
                # but be careful with extreme recursion_limit settings
                if recursion_limit is None or recursion_limit > 0:
                    ap['process_content'] = True
            # record for further processing, unless this path was somehow
            # already discovered as a subdataset above
            if ap['path'] not in ds_to_annotate_from_recursion:
                to_add.append(ap)
        if got_nothing:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict(
                'add',
                status='notneeded',
                path=refds_path,
                type='dataset',
                logger=lgr)
            return

        for subds in ds_to_annotate_from_recursion:
            if subds not in subds_to_add:
                # always prefer the already annotated path
                subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

        if dataset:
            # we have a base dataset, discover any intermediate datasets between
            # the base and any already discovered dataset
            discovered = {}
            discover_dataset_trace_to_targets(
                # from here
                dataset.path,
                # to any dataset we are aware of
                subds_to_add.keys(),
                [],
                discovered)
            for parentds in discovered:
                for subds in discovered[parentds]:
                    subds_to_add[subds] = subds_to_add.get(
                        subds,
                        dict(path=subds, parentds=parentds, type='dataset'))

        # merge custom paths and discovered dataset records; paths need to go
        # first, because we know the most about them, and in the subsequent
        # annotation call the later duplicates are skipped
        to_add.extend(subds_to_add.values())
        # and compact, this should be OK as all the info is in each ap dict
        to_add = unique(to_add, lambda x: x['path'])

        if not to_add:
            # nothing left to do, potentially all errored before
            return

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, it will also deduplicate
        annotated_paths = AnnotatePaths.__call__(
            path=to_add,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='add',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert(not completed)

        if not content_by_ds:
            # we should have complained about any inappropriate path argument
            # above, so if nothing is left, we can simply exit
            return

        # simple loop over datasets -- save happens later
        # start deep down
        to_save = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            torepoadd = {}
            respath_by_status = {}
            for ap in content_by_ds[ds_path]:
                # we have a new story
                ap.pop('status', None)
                torepoadd[ap['path']] = ap

                # skip anything that doesn't look like a wannabe subdataset
                if not ap.get('type', None) == 'dataset' or \
                        ap['path'] == ds_path:
                    continue

                if ap.get('registered_subds', False):
                    # subdataset that might be in this list because of the
                    # need to save all the way up to a super dataset
                    respath_by_status['success'] = \
                        respath_by_status.get('success', []) + [ap['path']]
                    yield get_status_dict(
                        status='notneeded',
                        message="already known subdataset",
                        **dict(common_report, **ap))
                    continue
                subds = Dataset(ap['path'])
                subds_relpath = relpath(ap['path'], ds_path)
                # Register the repository in the repo tree as a submodule
                try:
                    ds.repo.add_submodule(subds_relpath, url=None, name=None)
                except (CommandError, InvalidGitRepositoryError) as e:
                    yield get_status_dict(
                        ds=subds, status='error', message=e.stderr,
                        **dict(common_report, **ap))
                    continue
                # queue for saving using the updated annotated path
                ap['registered_subds'] = True
                # I hope this is true in direct mode too
                # TODO this is disabled, because in some circumstances
                # staging just doesn't happen, and it is unclear when
                # exactly -- the case that prompted disabling was a submodule
                # that had no content except for other submodules and was not
                # staged, whereas another submodule on the same level in the
                # same superdataset, which also has one file in it, was staged
                # disabled to work correctly, at the cost of a little slowdown
                #ap['staged'] = True
                to_save.append(ap)
                # report added subdatasets -- `annex add` below won't do it
                yield get_status_dict(
                    ds=subds,
                    status='ok',
                    message='added new subdataset',
                    **dict(common_report, **ap))
                # make sure that .gitmodules is added to the list of files
                gitmodules_path = opj(ds.path, '.gitmodules')
                # for git
                torepoadd[gitmodules_path] = dict(path=gitmodules_path)
                # and for save
                to_save.append(dict(
                    path=gitmodules_path,
                    parentds=ds_path,
                    type='file'))
            # make sure any last minute additions make it to the saving stage
            # XXX? should content_by_ds become OrderedDict so that possible
            # super here gets processed last?
            lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
            is_annex = isinstance(ds.repo, AnnexRepo)
            add_kw = {'jobs': jobs} if is_annex and jobs else {}
            added = ds.repo.add_(
                list(torepoadd.keys()),
                git=to_git if is_annex else True,
                **add_kw
            )
            for a in added:
                res = annexjson2result(a, ds, type='file', **common_report)
                success = success_status_map[res['status']]
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
                # produce best possible path/result annotation
                if res['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    res = dict(torepoadd[res['path']], **res)
                # override this in all cases to be safe
                res['parentds'] = ds.path
                if success:
                    # this was successfully added, queue for saving this very path
                    # in the dataset
                    ap = {k: v for k, v in res.items() if k != 'status'}
                    ap['staged'] = True
                    # strip any status and state info (e.g. save will refuse to
                    # save stuff that is marked state='untracked')
                    to_save.append({k: v for k, v in res.items()
                                    if k not in ('status', 'state')})
                if a['file'] == '.gitmodules':
                    # filter out .gitmodules, because this is only included for
                    # technical reasons and has nothing to do with the actual content
                    continue
                if GitRepo.is_valid_repo(res['path']):
                    # more accurate report in case of an added submodule
                    # mountpoint.
                    # XXX Actually not sure if this can really happen
                    # (depends on what our low-level code would do)
                    # but worst case is that we lose a little bit of
                    # coverage...
                    res['type'] = 'dataset'
                    res['message'] = 'added new state as submodule'
                yield res

            for r in results_from_annex_noinfo(
                    ds, torepoadd, respath_by_status,
                    dir_fail_msg='could not add some content in %s %s',
                    noinfo_dir_msg='nothing to add from %s',
                    noinfo_file_msg='already included in the dataset',
                    action='add',
                    logger=lgr,
                    refds=refds_path):
                if r['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    r = dict(r, **torepoadd[r['path']])

                if r['status'] == 'notneeded':
                    # this could be a file that was staged already, it doesn't need
                    # to be added, but it should be saved/committed if so desired
                    to_save.append({k: v for k, v in r.items()
                                    if k not in ('status', 'state')})

                # XXX something is fishy with the next one, rethink when sober....
                if r['path'] == ds_path and r['status'] == 'ok':
                    # this is for the entire dataset itself which was explicitly requested
                    # make sure to save all
                    r['type'] = 'dataset'
                    r['process_content'] = True
                    to_save.append({k: v for k, v in r.items() if k != 'status'})
                yield r
            if refds_path and ds_path != refds_path and len(respath_by_status.get('success', [])):
                # TODO XXX we have an issue here with `add('.')` when annex ignores any
                # dotfiles. In this case we end up not saving a dataset completely, because
                # we rely on accurate reporting. there is an issue about this already
                # TODO look up the issue ID
                # if there is a base dataset, but we are below it, and we have anything done to this
                # dataset -> queue dataset itself for saving its state in the parent
                ds_ap = dict(
                    path=ds.path,
                    # we have to look for the parent here, as we must save the
                    # subdataset in the parent and not the whole subdataset itself
                    type='dataset')
                parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
                if parentds:
                    ds_ap['parentds'] = parentds
                if dataset:
                    ds_ap['refds'] = refds_path
                to_save.append(ds_ap)

        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        # TODO tell save what was staged already! Set 'staged=True' for
        # respective annotated paths that are fed into `save`

        # do not reuse any of the sorting done in here for saving, but instead
        # pass on all the annotated paths to have `save` figure out what to do
        # with them -- this costs something, but should be safer, and frankly
        # is more comprehensible
        for res in Save.__call__(
                # hand-selected annotated paths
                path=to_save,
                dataset=refds_path,
                message=message if message else '[DATALAD] added content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
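
The deduplication above uses `unique(to_add, lambda x: x['path'])`. A minimal, order-preserving sketch of such a helper (the actual datalad.utils implementation may differ):

def unique(seq, key=None):
    # keep the first occurrence of each key, preserving input order
    seen = set()
    out = []
    for item in seq:
        k = key(item) if key else item
        if k not in seen:
            seen.add(k)
            out.append(item)
    return out

assert unique([{'path': 'a'}, {'path': 'a'}],
              lambda x: x['path']) == [{'path': 'a'}]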
Example #4
File: add.py Project: nellh/datalad
    def __call__(
            path=None,
            dataset=None,
            # support passing this through in a path by path basis
            to_git=None,
            save=True,
            message=None,
            recursive=False,
            recursion_limit=None,
            ds2super=False,
            git_opts=None,
            annex_opts=None,
            annex_add_opts=None,
            jobs=None):
        # parameter constraints:
        if not path:
            raise InsufficientArgumentsError(
                "insufficient information for adding: requires at least a path"
            )
        refds_path = Interface.get_refds_path(dataset)
        common_report = dict(action='add', logger=lgr, refds=refds_path)

        to_add = []
        subds_to_add = {}
        ds_to_annotate_from_recursion = {}
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=dataset,
                # never recursion, need to handle manually below to be able to
                # discover untracked content
                recursive=False,
                action='add',
                # speed things up by using Git's modification detection, if there
                # is a repo with at least one commit
                modified='HEAD' \
                if dataset and \
                GitRepo.is_valid_repo(refds_path) and \
                GitRepo(refds_path).get_hexsha() \
                else None,
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            got_nothing = False
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('parentds', None) is None and ap.get('type',
                                                           None) != 'dataset':
                yield get_status_dict(
                    status='impossible',
                    message='"there is no dataset to add this path to',
                    **dict(common_report, **ap))
                continue
            if ap.get('type', None) == 'directory' and \
                    ap.get('state', None) == 'untracked' and \
                    GitRepo.is_valid_repo(ap['path']):
                # this is an untracked wannabe subdataset in disguise
                ap['type'] = 'dataset'
            if recursive and \
                    (ap.get('raw_input', False) or
                     ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                    (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
                # this was an actually requested input path, or a path that was found
                # modified by path annotation, based on an input argument
                # we need to recurse into all subdirs to find potentially
                # unregistered subdatasets
                # but only if this path has a parent, or is itself a dataset
                # otherwise there is nothing to add to
                _discover_subdatasets_recursively(
                    ds_to_annotate_from_recursion, ap['path'],
                    [ap['parentds'] if 'parentds' in ap else ap['path']],
                    recursion_limit)
                # get the file content of the root dataset of this search added too
                # but be careful with extreme recursion_limit settings
                if recursion_limit is None or recursion_limit > 0:
                    ap['process_content'] = True
            # record for further processing, unless this path was somehow
            # already discovered as a subdataset above
            if ap['path'] not in ds_to_annotate_from_recursion:
                to_add.append(ap)
            # TODO check if next isn't covered by discover_dataset_trace_to_targets already??
            if dataset and ap.get('type', None) == 'dataset':
                # duplicates not possible, annotated_paths returns unique paths
                subds_to_add[ap['path']] = ap
        if got_nothing:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict('add',
                                  status='notneeded',
                                  path=refds_path,
                                  type='dataset',
                                  logger=lgr)
            return

        for subds in ds_to_annotate_from_recursion:
            if subds not in subds_to_add:
                # always prefer the already annotated path
                subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

        if dataset:
            # we have a base dataset, discover any intermediate datasets between
            # the base and any already discovered dataset
            discovered = {}
            discover_dataset_trace_to_targets(
                # from here
                dataset.path,
                # to any dataset we are aware of
                subds_to_add.keys(),
                [],
                discovered)
            for parentds in discovered:
                for subds in discovered[parentds]:
                    subds_to_add[subds] = subds_to_add.get(
                        subds,
                        dict(path=subds, parentds=parentds, type='dataset'))

        # merge custom paths and discovered dataset records; paths need to go
        # first, because we know the most about them, and in the subsequent
        # annotation call the later duplicates are skipped
        to_add.extend(subds_to_add.values())
        # and compact, this should be OK as all the info is in each ap dict
        to_add = unique(to_add, lambda x: x['path'])

        if not to_add:
            # nothing left to do, potentially all errored before
            return

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, it will also deduplicate
        annotated_paths = AnnotatePaths.__call__(
            path=to_add,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='add',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert (not completed)

        if not content_by_ds:
            # we should have complained about any inappropriate path argument
            # above, so if nothing is left, we can simply exit
            return

        # simple loop over datasets -- save happens later
        # start deep down
        to_save = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            torepoadd = {}
            respath_by_status = {}
            for ap in content_by_ds[ds_path]:
                # we have a new story
                ap.pop('status', None)
                torepoadd[ap['path']] = ap

                # skip anything that doesn't look like a wannabe subdataset
                if not ap.get('type', None) == 'dataset' or \
                        ap['path'] == ds_path:
                    continue

                if ap.get('registered_subds', False):
                    # subdataset that might be in this list because of the
                    # need to save all the way up to a super dataset
                    respath_by_status['success'] = \
                        respath_by_status.get('success', []) + [ap['path']]
                    yield get_status_dict(status='notneeded',
                                          message="already known subdataset",
                                          **dict(common_report, **ap))
                    continue
                subds = Dataset(ap['path'])
                # check that the subds has a commit, and refuse
                # to operate on it otherwise, or we would get a bastard
                # submodule that cripples git operations
                if not subds.repo.get_hexsha():
                    yield get_status_dict(
                        ds=subds,
                        status='impossible',
                        message='cannot add subdataset with no commits',
                        **dict(common_report, **ap))
                    continue
                subds_relpath = relpath(ap['path'], ds_path)
                # make an attempt to configure a submodule source URL based on the
                # discovered remote configuration
                remote, branch = subds.repo.get_tracking_branch()
                subds_url = subds.repo.get_remote_url(
                    remote) if remote else None
                # Register the repository in the repo tree as a submodule
                try:
                    ds.repo.add_submodule(subds_relpath,
                                          url=subds_url,
                                          name=None)
                except CommandError as e:
                    yield get_status_dict(ds=subds,
                                          status='error',
                                          message=e.stderr,
                                          **dict(common_report, **ap))
                    continue
                # queue for saving using the updated annotated path
                ap['registered_subds'] = True
                # I hope this is true in direct mode too
                # TODO this is disabled, because in some circumstances
                # staging just doesn't happen, and it is unclear when
                # exactly -- the case that prompted disabling was a submodule
                # that had no content except for other submodules and was not
                # staged, whereas another submodule on the same level in the
                # same superdataset, which also has one file in it, was staged
                # disabled to work correctly, at the cost of a little slowdown
                #ap['staged'] = True
                to_save.append(ap)
                _fixup_submodule_dotgit_setup(ds, subds_relpath)
                # report added subdatasets -- `annex add` below won't do it
                yield get_status_dict(ds=subds,
                                      status='ok',
                                      message='added new subdataset',
                                      **dict(common_report, **ap))
                # make sure that .gitmodules is added to the list of files
                gitmodules_path = opj(ds.path, '.gitmodules')
                # for git
                torepoadd[gitmodules_path] = dict(path=gitmodules_path)
                # and for save
                to_save.append(
                    dict(path=gitmodules_path, parentds=ds_path, type='file'))
            # make sure any last minute additions make it to the saving stage
            # XXX? should content_by_ds become OrderedDict so that possible
            # super here gets processed last?
            lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
            is_annex = isinstance(ds.repo, AnnexRepo)
            add_kw = {'jobs': jobs} if is_annex and jobs else {}
            added = ds.repo.add(list(torepoadd.keys()),
                                git=to_git if is_annex else True,
                                **add_kw)
            for a in added:
                res = annexjson2result(a, ds, type='file', **common_report)
                success = success_status_map[res['status']]
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
                # produce best possible path/result annotation
                if res['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    res = dict(torepoadd[res['path']], **res)
                # override this in all cases to be safe
                res['parentds'] = ds.path
                if success:
                    # this was successfully added, queue for saving this very path
                    # in the dataset
                    ap = {k: v for k, v in res.items() if k != 'status'}
                    ap['staged'] = True
                    # strip any status and state info (e.g. save will refuse to
                    # save stuff that is marked state='untracked')
                    to_save.append({
                        k: v
                        for k, v in res.items() if k not in ('status', 'state')
                    })
                if a['file'] == '.gitmodules':
                    # filter out .gitmodules, because this is only included for
                    # technical reasons and has nothing to do with the actual content
                    continue
                if GitRepo.is_valid_repo(res['path']):
                    # more accurate report in case of an added submodule
                    # mountpoint.
                    # XXX Actually not sure if this can really happen
                    # (depends on what our low-level code would do)
                    # but worst case is that we lose a little bit of
                    # coverage...
                    res['type'] = 'dataset'
                    res['message'] = 'added new state as submodule'
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    torepoadd,
                    respath_by_status,
                    dir_fail_msg='could not add some content in %s %s',
                    noinfo_dir_msg='nothing to add from %s',
                    noinfo_file_msg='already included in the dataset',
                    action='add',
                    logger=lgr,
                    refds=refds_path):
                if r['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    r = dict(r, **torepoadd[r['path']])

                if r['status'] == 'notneeded':
                    # this could be a file that was staged already, it doesn't need
                    # to be added, but it should be saved/committed if so desired
                    to_save.append({
                        k: v
                        for k, v in r.items() if k not in ('status', 'state')
                    })

                # XXX something is fishy with the next one, rethink when sober....
                if r['path'] == ds_path and r['status'] == 'ok':
                    # this is for the entire dataset itself which was explicitly requested
                    # make sure to save all
                    r['type'] = 'dataset'
                    r['process_content'] = True
                    to_save.append(
                        {k: v
                         for k, v in r.items() if k != 'status'})
                yield r
            if refds_path and ds_path != refds_path and len(
                    respath_by_status.get('success', [])):
                # TODO XXX we have an issue here with `add('.')` when annex ignores any
                # dotfiles. In this case we end up not saving a dataset completely, because
                # we rely on accurate reporting. there is an issue about this already
                # TODO look up the issue ID
                # if there is a base dataset, but we are below it, and we have anything done to this
                # dataset -> queue dataset itself for saving its state in the parent
                ds_ap = dict(
                    path=ds.path,
                    # we have to look for the parent here, as we must save the
                    # subdataset in the parent and not the whole subdataset itself
                    type='dataset')
                parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
                if parentds:
                    ds_ap['parentds'] = parentds
                if dataset:
                    ds_ap['refds'] = refds_path
                to_save.append(ds_ap)

        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        # TODO tell save what was staged already! Set 'staged=True' for
        # respective annotated paths that are fed into `save`

        # do not reuse any of the sorting done in here for saving, but instead
        # pass on all the annotated paths to have `save` figure out what to do
        # with them -- this costs something, but should be safer, and frankly
        # is more comprehensible
        for res in Save.__call__(
                # hand-selected annotated paths
                path=to_save,
                dataset=refds_path,
                message=message if message else '[DATALAD] added content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
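
All of these commands report outcomes as result dictionaries assembled by `get_status_dict`. A bare-bones sketch of the pattern (fields taken from the calls above; the real helper also derives fields such as `path` from a passed `ds`, which this sketch omits):

def get_status_dict(action=None, **kwargs):
    # merge an optional action name with arbitrary result fields
    res = dict(kwargs)
    if action is not None:
        res['action'] = action
    return res

res = get_status_dict('add', status='notneeded', path='/some/ds',
                      type='dataset')
assert res['status'] == 'notneeded' and res['action'] == 'add'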
Example #5
    def __call__(path=None,
                 dataset=None,
                 merge_native='init',
                 recursive=False,
                 recursion_limit=None,
                 save=True):
        refds_path = Interface.get_refds_path(dataset)

        # it really doesn't work without a dataset
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='metadata aggregation')
        # always include the reference dataset
        path = assure_list(path)
        path.append(ds.path)

        agginfo_db = {}
        to_save = []
        to_aggregate = set()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='aggregate_metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            ap_type = ap.get('type', None)
            ap_state = ap.get('state', None)
            assert ('parentds' in ap or ap_type == 'dataset')
            if ap_type == 'dataset' and ap_state != 'absent':
                # a present dataset, we can take directly from it
                aggsrc = ap['path']
                lgr.info('Aggregate metadata for dataset %s', aggsrc)
            else:
                # everything else needs to come from the parent
                aggsrc = ap['parentds']
                if ap_state == 'absent':
                    lgr.info(
                        'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                        ap['path'], aggsrc)
                else:
                    lgr.info('Aggregate metadata for %s from dataset at %s',
                             ap['path'], aggsrc)

            to_aggregate.add(aggsrc)

            if ap_state == 'absent':
                # key thought: recursive is done by path annotation, hence
                # once we hit an absent dataset, we are 100% certain that
                # there is nothing to recurse into on the file system
                # hence we only have to look into the aggregated metadata
                # of the last available dataset in the dataset tree edge
                #
                # if there is nothing at this path, we need to look into the
                # parentds and check if we know anything about this path
                # if we do, we need to grab all the info and objects
                # if not, we need to error
                res = _get_dsinfo_from_aggmetadata(aggsrc, ap['path'],
                                                   recursive, agginfo_db)
                if not isinstance(res, list):
                    yield get_status_dict(status='impossible',
                                          message=res,
                                          action='aggregate_metadata',
                                          path=ap['path'],
                                          logger=lgr)
                    continue
                # cue for aggregation
                to_aggregate.update(res)
            else:
                # actually aggregate metadata for this dataset, immediately place
                # generated objects into the aggregated or reference dataset,
                # and put info into the DB to get it distributed to all datasets
                # that need to be updated
                errored = _extract_metadata(ds, Dataset(aggsrc), agginfo_db,
                                            merge_native, to_save)
                if errored:
                    yield get_status_dict(
                        status='error',
                        message=
                        'Metadata extraction failed (see previous error message)',
                        action='aggregate_metadata',
                        path=aggsrc,
                        logger=lgr)

        # at this point we have dumped all aggregated metadata into object files
        # somewhere, we know what needs saving but have not saved anything yet,
        # and we know about the states of all aggregated datasets in the DB.
        # what remains to be done is to update all datasets, so they have their
        # own copy of aggregated metadata, and to update their respective
        # aggregate.json with info on what states we just aggregated from

        # first, let's figure out what datasets need updating at all
        # get adjacency info of the dataset tree spanning the base to all leaf
        # datasets associated with the path arguments
        ds_adj = {}
        discover_dataset_trace_to_targets(ds.path, to_aggregate, [], ds_adj)
        # TODO we need to work in the info about datasets that we only got from
        # aggregated metadata, and that have no trace on the file system, in here!!
        subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
        # go over datasets in bottom-up fashion
        for parentds_path in sorted(subtrees, reverse=True):
            lgr.info('Update aggregate metadata in dataset at: %s',
                     parentds_path)

            _update_ds_agginfo(ds.path, parentds_path, subtrees[parentds_path],
                               agginfo_db, to_save)
            # update complete
            yield get_status_dict(status='ok',
                                  action='aggregate_metadata',
                                  path=parentds_path,
                                  type='dataset',
                                  logger=lgr)
        #
        # save potential modifications to dataset global metadata
        #
        if not to_save:
            return
        lgr.info('Attempting to save %i files/datasets', len(to_save))
        for res in Save.__call__(
                path=to_save,
                dataset=refds_path,
                message='[DATALAD] dataset aggregate metadata update',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
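
Both `add` and the aggregation code walk datasets bottom-up via `sorted(..., reverse=True)`: a subdataset path has its superdataset path as a proper prefix, so it sorts after it, and reverse order therefore visits children before parents. A small demonstration:

# reverse lexicographic order yields subdatasets before their superdatasets
paths = ['/ds', '/ds/sub1', '/ds/sub1/subsub', '/ds/sub2']
assert sorted(paths, reverse=True) == \
    ['/ds/sub2', '/ds/sub1/subsub', '/ds/sub1', '/ds']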
Example #6
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 check=True,
                 save=True,
                 message=None,
                 if_dirty='save-before'):
        res_kwargs = dict(action='remove', logger=lgr)
        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `remove`: requires at least a path or dataset"
            )
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs['refds'] = refds_path
        if refds_path and not path and not GitRepo.is_valid_repo(refds_path):
            # nothing here, nothing to remove
            yield get_status_dict(path=refds_path,
                                  status='notneeded',
                                  **res_kwargs)
            return
        if refds_path and not path:
            # act on the whole dataset if nothing else was specified
            # TODO i think that would happen automatically in annotation?
            path = refds_path

        to_process = []

        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                # we only ever want to discover immediate subdatasets, the rest
                # will happen in `uninstall`
                recursion_limit=1,
                action='remove',
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('state', None) == 'absent' and \
                    ap.get('parentds', None) is None:
                # nothing exists at location, and there is no parent to
                # remove from
                ap['status'] = 'notneeded'
                ap['message'] = "path does not exist and is not in a dataset"
                yield ap
                continue
            if ap.get('raw_input', False) and \
                    ap.get('type', None) == 'dataset':
                # make sure dataset sorting yields a dedicated entry for this one
                ap['process_content'] = True
            to_process.append(ap)

        if not to_process:
            # nothing left to do, potentially all errored before
            return

        if path_is_under([ap['path'] for ap in to_process]):
            # behave like `rm` and refuse to remove where we are
            raise ValueError(
                "refusing to uninstall current or parent directory")

        # now sort into datasets so we can process them one by one
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert (not completed)

        # iterate over all datasets, starting at the bottom
        # to make the removal of dataset content known upstairs
        to_save = []
        # track which submodules we have removed in the process, to avoid
        # failure in case we revisit them due to a subsequent path argument
        subm_removed = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            paths = content_by_ds[ds_path]
            to_reporemove = dict()
            # PLAN any dataset that was not raw_input, uninstall (passing recursive flag)
            # if dataset itself is in paths, skip any nondataset
            # sort reverse so we get subdatasets first
            for ap in sorted(paths, key=lambda x: x['path'], reverse=True):
                if ap.get('type', None) == 'dataset':
                    # entire dataset needs to go, uninstall if present, pass recursive!
                    uninstall_failed = False
                    if ap['path'] == refds_path or \
                            (refds_path is None and ap.get('raw_input', False)):
                        # top-level handling, cannot use regular uninstall call, as
                        # it will refuse to uninstall a top-level dataset
                        # and rightfully so, it is really a remove in that case
                        # bypass all the safety by using low-level helper
                        for r in _uninstall_dataset(ds,
                                                    check=check,
                                                    has_super=False,
                                                    **res_kwargs):
                            if r['status'] in ('impossible', 'error'):
                                # we need to inspect if something went wrong, in order
                                # to prevent failure from removing a non-empty dir below,
                                # but at the same time allow for continued processing
                                uninstall_failed = True
                            r['refds'] = refds_path
                            yield r
                    # recheck that it wasn't removed during a previous iteration
                    elif ap.get('state', None) != 'absent' and \
                            GitRepo.is_valid_repo(ap['path']):
                        # anything that is not the top-level -> regular uninstall
                        # this is for subdatasets of the to-be-removed dataset
                        # we want to simply uninstall them in a regular manner
                        for r in Uninstall.__call__(
                                # use annotated path as input, but pass a copy
                                # because we cannot rely on it being unaltered
                                # by reannotation
                                # TODO maybe adjust annotate_path to do that
                                [ap.copy()],
                                dataset=refds_path,
                                recursive=recursive,
                                check=check,
                                if_dirty=if_dirty,
                                result_xfm=None,
                                result_filter=None,
                                on_failure='ignore'):
                            if r['status'] in ('impossible', 'error'):
                                # we need to inspect if something went wrong, in order
                                # to prevent failure from removing a non-empty dir below,
                                # but at the same time allow for continued processing
                                uninstall_failed = True
                            yield r
                    if not ap.get('raw_input', False):
                        # we only ever want to actually unregister subdatasets that
                        # were given explicitly
                        continue
                    if not uninstall_failed and \
                            ap['path'] not in subm_removed and \
                            refds_path and \
                            ap.get('parentds', None) and \
                            not (relpath(ap['path'], start=refds_path).startswith(pardir) or
                                 ap['path'] == refds_path) and \
                            ap.get('registered_subds', False):
                        # strip from superdataset, but only if a dataset was
                        # given explicitly, as in "remove from this dataset",
                        # but not when just a path was given, as in "remove
                        # from the filesystem"
                        subds_relpath = relpath(ap['path'],
                                                start=ap['parentds'])
                        # remove submodule reference
                        parentds = Dataset(ap['parentds'])
                        # play safe, will fail on dirty
                        parentds.repo.deinit_submodule(ap['path'])
                        # remove now empty submodule link
                        parentds.repo.remove(ap['path'])
                        # make a record that we removed this already, should it
                        # be revisited via another path argument, because we do
                        # not reannotate the paths after every removal
                        subm_removed.append(ap['path'])
                        yield dict(ap, status='ok', **res_kwargs)
                        # need .gitmodules update in parent
                        to_save.append(
                            dict(path=opj(parentds.path, '.gitmodules'),
                                 parentds=parentds.path,
                                 type='file'))
                        # and the removal itself needs to be committed
                        # inform `save` that it is OK that this path
                        # doesn't exist on the filesystem anymore
                        ap['unavailable_path_status'] = ''
                        ap['process_content'] = False
                        to_save.append(ap)
                    if not uninstall_failed and exists(ap['path']):
                        # could be an empty dir in case an already uninstalled subdataset
                        # got removed
                        rmdir(ap['path'])
                else:
                    # anything that is not a dataset can simply be passed on
                    to_reporemove[ap['path']] = ap
            # avoid unnecessary git calls when there is nothing to do
            if to_reporemove:
                if check and hasattr(ds.repo, 'drop'):
                    for r in _drop_files(ds, list(to_reporemove), check=True):
                        if r['status'] == 'error':
                            # if drop errored on that path, we can't remove it
                            to_reporemove.pop(r['path'], 'avoidKeyError')
                        yield r

                if to_reporemove:
                    for r in ds.repo.remove(list(to_reporemove), r=True):
                        # these were removed, but we still need to save the
                        # removal

                        r_abs = opj(ds.path, r)
                        if r_abs in to_reporemove:
                            ap = to_reporemove[r_abs]
                        else:
                            ap = {
                                'path': r_abs,
                                'parentds': ds.path,
                                'refds': refds_path
                            }
                        ap['unavailable_path_status'] = ''
                        to_save.append(ap)
                        yield get_status_dict(status='ok',
                                              path=r,
                                              **res_kwargs)

        if not to_save:
            # nothing left to do, potentially all errored before
            return
        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        for res in Save.__call__(
                # TODO compose hand-selected annotated paths
                path=to_save,
                # we might have removed the reference dataset by now, recheck
                dataset=refds_path
                if (refds_path and GitRepo.is_valid_repo(refds_path))
                else None,
                message=message if message else '[DATALAD] removed content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
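
The `rm`-like safety check above delegates to `path_is_under`. A minimal sketch of such a guard (an assumed reading of the helper's semantics; the actual datalad.utils implementation may treat symlinks and corner cases differently):

import os
import os.path as op

def path_is_under(paths):
    # True if the current directory is at or below any of the given paths
    cwd = op.realpath(os.getcwd())
    for p in paths:
        target = op.realpath(p)
        if cwd == target or cwd.startswith(target + os.sep):
            return True
    return False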
Example #7
    def __call__(path=None,
                 dataset=None,
                 add=None,
                 init=None,
                 remove=None,
                 reset=None,
                 define_key=None,
                 dataset_global=False,
                 recursive=False,
                 recursion_limit=None):
        # bring metadata setter args into shape first
        untag, remove = _parse_argspec(remove)
        purge, reset = _parse_argspec(reset)
        tag_add, add = _parse_argspec(add)
        tag_init, init = _parse_argspec(init)
        define_key = dict(define_key) if define_key else None
        # merge all potential sources of tag specifications
        all_untag = remove.get('tag', []) + untag
        if all_untag:
            remove['tag'] = all_untag
        all_addtag = add.get('tag', []) + tag_add
        if all_addtag:
            add['tag'] = all_addtag
        all_inittag = init.get('tag', []) + tag_init
        if all_inittag:
            init['tag'] = all_inittag

        lgr.debug("Will 'init' metadata items: %s", init)
        lgr.debug("Will 'add' metadata items: %s", add)
        lgr.debug("Will 'remove' metadata items: %s", remove)
        lgr.debug("Will 'reset' metadata items: %s", reset)
        lgr.debug("Will 'purge' metadata items: %s", purge)

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='metadata', logger=lgr, refds=refds_path)

        to_process = []
        for ap in AnnotatePaths.__call__(dataset=refds_path,
                                         path=path,
                                         recursive=recursive,
                                         recursion_limit=recursion_limit,
                                         action='metadata',
                                         unavailable_path_status='error',
                                         nondataset_path_status='error',
                                         force_subds_discovery=False,
                                         return_type='generator',
                                         on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset':
                if ap.get('state', None) == 'absent':
                    # just discovered via recursion, but not relevant here
                    continue
                if GitRepo.is_valid_repo(ap['path']):
                    ap['process_content'] = True
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path,
                path_only=False)
        assert (not completed)

        # iterate over all datasets, order doesn't matter
        to_save = []
        for ds_path in content_by_ds:
            # ignore submodule entries
            content = [
                ap for ap in content_by_ds[ds_path]
                if ap.get('type', None) != 'dataset' or ap['path'] == ds_path
            ]
            if not content:
                # nothing other than subdatasets were given or discovered in
                # this dataset, ignore
                continue
            ds = Dataset(ds_path)
            if dataset_global or define_key:
                db_path = opj(ds.path, '.datalad', 'metadata', 'dataset.json')
                db = {}
                if exists(db_path):
                    db_fp = open(db_path)
                    # need to read manually, load() would puke on an empty file
                    db_content = db_fp.read()
                    # minimize time for collision
                    db_fp.close()
                    if db_content:
                        db = json.loads(db_content)
                # TODO make manipulation order identical to what git-annex does
                for k, v in init.items() if init else []:
                    if k not in db:
                        db[k] = v
                for k in purge:
                    if k in db:
                        del db[k]
                for k, v in reset.items():
                    db[k] = v
                for k, v in add.items():
                    db[k] = sorted(unique(db.get(k, []) + v))
                for k, v in remove.items():
                    existing_data = db.get(k, [])
                    if isinstance(existing_data, dict):
                        db[k] = {
                            dk: existing_data[dk]
                            for dk in set(existing_data).difference(v)
                        }
                    else:
                        db[k] = list(set(existing_data).difference(v))
                    # wipe out if empty
                    if not db[k]:
                        del db[k]

                added_def = False
                if define_key:
                    defs = db.get('definition', {})
                    for k, v in define_key.items():
                        if k in defs:
                            if defs[k] != v:
                                yield get_status_dict(
                                    status='error',
                                    ds=ds,
                                    message=
                                    ("conflicting definition for key '%s': '%s' != '%s'",
                                     k, v, defs[k]),
                                    **res_kwargs)
                                continue
                        else:
                            defs[k] = v
                            added_def = True
                    db['definition'] = defs
                # store, if there is anything
                if db:
                    if not exists(dirname(db_path)):
                        makedirs(dirname(db_path))
                    db_fp = open(db_path, 'w')
                    # produce relatively compact, but also diff-friendly format
                    json.dump(db,
                              db_fp,
                              indent=0,
                              separators=(',', ':\n'),
                              sort_keys=True)
                    # minimize time for collision
                    db_fp.close()
                    # use add not save to also cover case of a fresh file
                    ds.add(db_path, save=False)
                    to_save.append(
                        dict(path=db_path, parentds=ds.path, type='file'))
                elif exists(db_path):
                    # no metadata left, kill file
                    ds.remove(db_path)
                    to_save.append(dict(path=ds.path, type='dataset'))
                if added_def or init or add or remove or reset or purge:
                    # if anything happened or could have happened
                    yield get_status_dict(status='ok',
                                          ds=ds,
                                          metadata=db,
                                          **res_kwargs)
            elif not isinstance(ds.repo, AnnexRepo):
                # report on all explicitly requested paths only
                for ap in [c for c in content if c.get('raw_input', False)]:
                    yield dict(
                        ap,
                        status='impossible',
                        message=(
                            'non-annex dataset %s has no file metadata support',
                            ds),
                        **res_kwargs)
                continue
            ds_paths = [p['path'] for p in content]
            if not dataset_global:
                if reset or purge or add or init or remove:
                    # file metadata manipulation
                    mod_paths = []
                    for mp in ds.repo.set_metadata(
                            ds_paths,
                            reset=reset,
                            add=add,
                            init=init,
                            remove=remove,
                            purge=purge,
                            # we always go recursive
                            # TODO is that a good thing? But how to otherwise distinguish
                            # this kind of recursive from the one across datasets in
                            # the API?
                            recursive=True):
                        if mp.get('success', False):
                            mod_paths.append(mp['file'])
                        else:
                            yield get_status_dict(
                                status='error',
                                message='setting metadata failed',
                                path=opj(ds.path, mp['file']),
                                type='file',
                                **res_kwargs)
                    # query the actually modified paths only
                    ds_paths = mod_paths

                # and lastly, query -- even if we set before -- there could
                # be side-effect from multiple set paths on an individual
                # path, hence we need to query to get the final result
                for file, meta in ds.repo.get_metadata(ds_paths):
                    r = get_status_dict(status='ok',
                                        path=opj(ds.path, file),
                                        type='file',
                                        metadata=meta,
                                        **res_kwargs)
                    yield r
        # save potential modifications to dataset global metadata
        if not to_save:
            return
        for res in Save.__call__(path=to_save,
                                 dataset=refds_path,
                                 message='[DATALAD] dataset metadata update',
                                 return_type='generator',
                                 result_xfm=None,
                                 result_filter=None,
                                 on_failure='ignore'):
            yield res
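
For orientation, a minimal usage sketch of the `metadata` command above, assuming it is exposed through the usual `datalad.api` entry point. The dataset location and file names are hypothetical; the argument shapes follow the tag-merging logic at the top of the implementation (bare strings in `add` appear to be treated as tags) and the `dict(define_key)` conversion.

from datalad.api import metadata

# tag two (hypothetical) annexed files; bare strings passed via `add`
# are merged into the 'tag' key by _parse_argspec
res = metadata(
    path=['sub-01/anat.nii', 'sub-01/func.nii'],
    dataset='/tmp/demo_ds',  # hypothetical dataset location
    add=['raw'],
)

# record a key definition in the dataset-global metadata file
# (.datalad/metadata/dataset.json), handled by the define_key branch
res = metadata(
    dataset='/tmp/demo_ds',
    dataset_global=True,
    define_key={'species': 'binomial name of the sampled organism'},
)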
Example #9
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            check=True,
            save=True,
            message=None,
            if_dirty='save-before'):
        res_kwargs = dict(action='remove', logger=lgr)
        if not dataset and not path:
            raise InsufficientArgumentsError(
                "insufficient information for `remove`: requires at least a path or dataset")
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs['refds'] = refds_path
        if refds_path and not path and not GitRepo.is_valid_repo(refds_path):
            # nothing here, nothing to remove
            yield get_status_dict(path=refds_path, status='notneeded', **res_kwargs)
            return
        if refds_path and not path:
            # act on the whole dataset if nothing else was specified
            # TODO I think that would happen automatically in annotation?
            path = refds_path

        to_process = []

        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                # we only ever want to discover immediate subdatasets, the rest
                # will happen in `uninstall`
                recursion_limit=1,
                action='remove',
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('state', None) == 'absent' and \
                    ap.get('parentds', None) is None:
                # nothing exists at location, and there is no parent to
                # remove from
                ap['status'] = 'notneeded'
                ap['message'] = "path does not exist and is not in a dataset"
                yield ap
                continue
            if ap.get('raw_input', False) and ap.get('type', None) == 'dataset':
                # make sure dataset sorting yields a dedicated entry for this one
                ap['process_content'] = True
            to_process.append(ap)

        if not to_process:
            # nothing left to do, potentially all errored before
            return

        if path_is_under([ap['path'] for ap in to_process]):
            # behave like `rm` and refuse to remove where we are
            raise ValueError(
                "refusing to uninstall current or parent directory")

        # now sort into datasets so we can process them one by one
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert(not completed)

        # iterate over all datasets, starting at the bottom
        # to make the removal of dataset content known upstairs
        to_save = []
        # track which submodules we have removed in the process, to avoid
        # failure in case we revisit them due to a subsequent path argument
        subm_removed = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            paths = content_by_ds[ds_path]
            to_reporemove = dict()
            # PLAN: uninstall any dataset that was not raw_input (passing the recursive flag)
            # if dataset itself is in paths, skip any nondataset
            # sort reverse so we get subdatasets first
            for ap in sorted(paths, key=lambda x: x['path'], reverse=True):
                if ap.get('type', None) == 'dataset':
                    # entire dataset needs to go, uninstall if present, pass recursive!
                    uninstall_failed = False
                    if ap['path'] == refds_path or \
                            (refds_path is None and ap.get('raw_input', False)):
                        # top-level handling, cannot use regular uninstall call, as
                        # it will refuse to uninstall a top-level dataset
                        # and rightfully so, it is really a remove in that case
                        # bypass all the safety by using low-level helper
                        for r in _uninstall_dataset(ds, check=check, has_super=False,
                                                    **res_kwargs):
                            if r['status'] in ('impossible', 'error'):
                                # we need to inspect if something went wrong, in order
                                # to prevent failure from removing a non-empty dir below,
                                # but at the same time allow for continued processing
                                uninstall_failed = True
                            r['refds'] = refds_path
                            yield r
                    # recheck that it wasn't removed during a previous iteration
                    elif ap.get('state', None) != 'absent' and GitRepo.is_valid_repo(ap['path']):
                        # anything that is not the top-level -> regular uninstall
                        # this is for subdatasets of the to-be-removed dataset
                        # we want to simply uninstall them in a regular manner
                        for r in Uninstall.__call__(
                                # use annotate path as input, but pass a copy because
                                # we cannot rely on it being unaltered by reannotation
                                # TODO maybe adjust annotate_path to do that
                                [ap.copy()],
                                dataset=refds_path, recursive=recursive, check=check,
                                if_dirty=if_dirty, result_xfm=None, result_filter=None,
                                on_failure='ignore'):
                            if r['status'] in ('impossible', 'error'):
                                # we need to inspect if something went wrong, in order
                                # to prevent failure from removing a non-empty dir below,
                                # but at the same time allow for continued processing
                                uninstall_failed = True
                            yield r
                    if not ap.get('raw_input', False):
                        # we only ever want to actually unregister subdatasets that
                        # were given explicitly
                        continue
                    if not uninstall_failed and \
                            not ap['path'] in subm_removed and \
                            refds_path and \
                            ap.get('parentds', None) and \
                            not (relpath(ap['path'], start=refds_path).startswith(pardir) or
                                 ap['path'] == refds_path) and \
                            ap.get('registered_subds', False):
                        # strip from superdataset, but only if a dataset was given explicitly
                        # as in "remove from this dataset", but not when just a path was given
                        # as in "remove from the filesystem"
                        subds_relpath = relpath(ap['path'], start=ap['parentds'])
                        # remove submodule reference
                        parentds = Dataset(ap['parentds'])
                        # play safe, will fail on dirty
                        parentds.repo.deinit_submodule(ap['path'])
                        # remove now empty submodule link
                        parentds.repo.remove(ap['path'])
                        # make a record that we removed this already, in case it is
                        # revisited via another path argument, because we do not
                        # reannotate the paths after every removal
                        subm_removed.append(ap['path'])
                        yield dict(ap, status='ok', **res_kwargs)
                        # need .gitmodules update in parent
                        to_save.append(dict(
                            path=opj(parentds.path, '.gitmodules'),
                            parents=parentds.path,
                            type='file'))
                        # and the removal itself needs to be committed
                        # inform `save` that it is OK that this path
                        # doesn't exist on the filesystem anymore
                        ap['unavailable_path_status'] = ''
                        ap['process_content'] = False
                        to_save.append(ap)
                    if not uninstall_failed and exists(ap['path']):
                        # could be an empty dir in case an already uninstalled subdataset
                        # got removed
                        rmdir(ap['path'])
                else:
                    # anything that is not a dataset can simply be passed on
                    to_reporemove[ap['path']] = ap
            # avoid unnecessary git calls when there is nothing to do
            if to_reporemove:
                if check and hasattr(ds.repo, 'drop'):
                    for r in _drop_files(ds, list(to_reporemove),
                                         check=True):
                        if r['status'] == 'error':
                            # if drop errored on that path, we can't remove it
                            to_reporemove.pop(r['path'], 'avoidKeyError')
                        yield r

                if to_reporemove:
                    for r in ds.repo.remove(list(to_reporemove), r=True):
                        # these were removed, but we still need to save the
                        # removal

                        r_abs = opj(ds.path, r)
                        if r_abs in to_reporemove:
                            ap = to_reporemove[r_abs]
                        else:
                            ap = {'path': r_abs,
                                  'parentds': ds.path,
                                  'refds': refds_path
                                  }
                        ap['unavailable_path_status'] = ''
                        to_save.append(ap)
                        yield get_status_dict(
                            status='ok',
                            path=r,
                            **res_kwargs)

        if not to_save:
            # nothing left to do, potentially all errored before
            return
        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        for res in Save.__call__(
                # TODO compose hand-selected annotated paths
                path=to_save,
                # we might have removed the reference dataset by now, recheck
                dataset=refds_path
                        if (refds_path and GitRepo.is_valid_repo(refds_path))
                        else None,
                message=message if message else '[DATALAD] removed content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
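
A minimal invocation sketch for the `remove` command above, assuming the usual `datalad.api` entry point; the paths are hypothetical and the keyword arguments mirror the `__call__` signature shown:

from datalad.api import remove

# remove a (hypothetical) subdirectory from a dataset: verify annexed
# content can be dropped first (check=True), then commit the removal
# via the final Save call
for res in remove(
        path='data/obsolete',
        dataset='/tmp/demo_ds',
        recursive=False,
        check=True,
        save=True,
        message='remove obsolete data'):
    print(res)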
Example #10
def test_interface_prep():
    # verify sanity if nothing was given, as it would look like from the
    # cmdline
    assert_equal(Save._prep(path=[], dataset=None), ({}, []))
Example #11
File: add.py Project: silky/datalad
    def __call__(path=None,
                 source=None,
                 dataset=None,
                 to_git=False,
                 save=True,
                 recursive=False,
                 recursion_limit=None,
                 if_dirty='ignore',
                 git_opts=None,
                 annex_opts=None,
                 annex_add_opts=None,
                 jobs=None):

        # parameter constraints:
        if not path and not source:
            raise InsufficientArgumentsError(
                "insufficient information for "
                "adding: requires at least a path "
                "or a source.")

        # When called from cmdline `path` and `source` will be a list even if
        # there is only one item.
        # Make sure we deal with the same when called via python API:
        # always yields list; empty if None
        path = assure_list(path)
        source = assure_list(source)

        # TODO: Q: are the list operations in the following 3 blocks (resolving
        #          paths, sources and datasets) guaranteed to be stable
        #          regarding order?

        # resolve path(s):
        # TODO: RF: resolve_path => datalad.utils => more general (repos => normalize paths)
        resolved_paths = [resolve_path(p, dataset) for p in path]

        # must come after resolve_path()!!
        # resolve dataset:
        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='adding')
        handle_dirty_dataset(dataset, if_dirty)

        # resolve source(s):
        resolved_sources = []
        for s in source:
            if not is_datalad_compat_ri(s):
                raise ValueError("invalid source parameter: %s" % s)
            resolved_sources.append(_get_git_url_from_source(s))

        # find (sub-)datasets to add things to (and fail on invalid paths):
        if recursive:

            # 1. Find the (sub-)datasets containing the given path(s):
            # Note, that `get_containing_subdataset` raises if `p` is
            # outside `dataset`, but it returns `dataset`, if `p` is inside
            # a subdataset not included by `recursion_limit`. In the latter
            # case, the git calls will fail instead.
            # We could check for this right here and fail early, but this
            # would lead to the need to discover the entire hierarchy no
            # matter if actually required.
            resolved_datasets = [
                dataset.get_containing_subdataset(
                    p, recursion_limit=recursion_limit) for p in resolved_paths
            ]

            # 2. Find implicit subdatasets to call add on:
            # If there are directories in resolved_paths (Note,
            # that this includes '.' and '..'), check for subdatasets
            # beneath them. These should be called recursively with '.'.
            # Therefore add the subdatasets to resolved_datasets and
            # corresponding '.' to resolved_paths, in order to generate the
            # correct call.
            for p in resolved_paths:
                if isdir(p):
                    for subds_path in \
                        dataset.get_subdatasets(absolute=True, recursive=True,
                                                recursion_limit=recursion_limit):
                        if subds_path.startswith(_with_sep(p)):
                            resolved_datasets.append(Dataset(subds_path))
                            resolved_paths.append(curdir)

        else:
            # if not recursive, try to add everything to dataset itself:
            resolved_datasets = [dataset for i in range(len(resolved_paths))]

        # we need a resolved dataset per path:
        assert len(resolved_paths) == len(resolved_datasets)

        # sort parameters for actual git/git-annex calls:
        # (dataset, path, source)
        from six.moves import zip_longest

        param_tuples = list(
            zip_longest(resolved_datasets, resolved_paths, resolved_sources))
        # possible None-datasets in `param_tuples` were filled in by zip_longest
        # and need to be replaced by `dataset`:
        param_tuples = [(d if d is not None else dataset, p, s)
                        for d, p, s in param_tuples]

        calls = {
            d.path: {  # list of paths to 'git-add':
                'g_add': [],
                # list of paths to 'git-annex-add':
                'a_add': [],
                # list of sources to 'git-annex-addurl':
                'addurl_s': [],
                # list of (path, source) to
                # 'git-annex-addurl --file':
                'addurl_f': []
            }
            for d in [i for i, p, s in param_tuples]
        }

        for ds, p, s in param_tuples:
            # it should not happen, that `path` as well as `source` are None:
            assert p or s
            if not s:
                # we have a path only
                # Do not try to add to annex whenever there is no annex
                if to_git or not isinstance(ds.repo, AnnexRepo):
                    calls[ds.path]['g_add'].append(p)
                else:
                    calls[ds.path]['a_add'].append(p)
            elif not p:
                # we have a source only
                if to_git:
                    raise NotImplementedError("Can't add a remote source "
                                              "directly to git.")
                calls[ds.path]['addurl_s'].append(s)
            else:
                # we have a path and a source
                if to_git:
                    raise NotImplementedError("Can't add a remote source "
                                              "directly to git.")
                calls[ds.path]['addurl_f'].append((p, s))

        # now do the actual add operations:
        # TODO: implement git/git-annex/git-annex-add options

        datasets_return_values = defaultdict(list)
        for dspath in calls:
            ds = Dataset(dspath)
            return_values = datasets_return_values[dspath]
            lgr.info("Processing dataset %s ..." % ds)

            # check every (sub-)dataset for annex once, since we can't add or
            # addurl anything, if there is no annex:
            # TODO: Q: Alternatively, just call git-annex-init if there's no
            # annex yet and we have an annex-add/annex-addurl request?
            _is_annex = isinstance(ds.repo, AnnexRepo)

            if calls[ds.path]['g_add']:
                lgr.debug("Adding %s to git", calls[dspath]['g_add'])
                added = ds.repo.add(calls[dspath]['g_add'],
                                    git=True,
                                    git_options=git_opts)
                return_values.extend(added)
            if calls[ds.path]['a_add']:
                if _is_annex:
                    lgr.debug("Adding %s to annex", calls[dspath]['a_add'])
                    return_values.extend(
                        ds.repo.add(calls[dspath]['a_add'],
                                    git=False,
                                    jobs=jobs,
                                    git_options=git_opts,
                                    annex_options=annex_opts,
                                    options=annex_add_opts))
                else:
                    lgr.debug("{0} is no annex. Skip 'annex-add' for "
                              "files {1}".format(ds, calls[dspath]['a_add']))
                    return_values.extend([{
                        'file': f,
                        'success': False,
                        'note': "no annex at %s" % ds.path
                    } for f in calls[dspath]['a_add']])

            # TODO: AnnexRepo.add_urls' return value doesn't contain the created
            #       file name but the url
            if calls[ds.path]['addurl_s']:
                if _is_annex:
                    lgr.debug("Adding urls %s to annex",
                              calls[dspath]['addurl_s'])
                    return_values.extend(
                        ds.repo.add_urls(
                            calls[ds.path]['addurl_s'],
                            options=annex_add_opts,
                            # TODO: extra parameter for addurl?
                            git_options=git_opts,
                            annex_options=annex_opts,
                            jobs=jobs,
                        ))
                else:
                    lgr.debug("{0} is no annex. Skip 'annex-addurl' for "
                              "files {1}".format(ds,
                                                 calls[dspath]['addurl_s']))
                    return_values.extend([{
                        'file': f,
                        'success': False,
                        'note': "no annex at %s" % ds.path
                    } for f in calls[dspath]['addurl_s']])

            if calls[ds.path]['addurl_f']:
                if _is_annex:
                    for f, u in calls[ds.path]['addurl_f']:
                        lgr.debug("Adding urls %s to files in annex",
                                  calls[dspath]['addurl_f'])
                        return_values.append(
                            ds.repo.add_url_to_file(
                                f,
                                u,
                                options=annex_add_opts,  # TODO: see above
                                git_options=git_opts,
                                annex_options=annex_opts,
                                batch=True))
                else:
                    lgr.debug("{0} is no annex. Skip 'annex-addurl' for "
                              "files {1}".format(ds,
                                                 calls[dspath]['addurl_f']))
                    return_values.extend([{
                        'file': f,
                        'success': False,
                        'note': "no annex at %s" % ds.path
                    } for f in calls[dspath]['addurl_f']])
            return_values = None  # to avoid mis-use

        # XXX or we could return entire datasets_return_values, could be useful
        # that way.  But then should be unified with the rest of commands, e.g.
        # get etc
        return_values_flat = []
        for dspath, return_values in datasets_return_values.items():
            if save and len(return_values):
                # we got something added -> save
                # everything we care about at this point should be staged already
                Save.__call__(message='[DATALAD] added content',
                              # use the dataset of this iteration; `ds` would be
                              # a stale leftover from the loop above
                              dataset=Dataset(dspath),
                              auto_add_changes=False,
                              recursive=False)
            # TODO: it feels like this is some common logic we already have somewhere
            dsrelpath = relpath(dspath, dataset.path)
            if dsrelpath != curdir:
                # we need to adjust the 'file' entry in each record
                for return_value in return_values:
                    if 'file' in return_value:
                        return_value['file'] = opj(dsrelpath,
                                                   return_value['file'])
                    return_values_flat.append(return_value)
            else:
                return_values_flat.extend(return_values)

        return return_values_flat
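
A usage sketch for this `add` implementation, again assuming the `datalad.api` entry point. File names and URLs are hypothetical; the three invocations correspond to the three call types sorted into the `calls` dict above:

from datalad.api import add

# path only: goes to 'annex-add' (or 'git-add' with to_git=True),
# followed by a Save of the staged changes
add(path='data/table.csv', dataset='/tmp/demo_ds', save=True)

# source only: fetched into the annex via 'git-annex-addurl'
add(source='http://example.com/archive.tar.gz', dataset='/tmp/demo_ds')

# path plus source: 'git-annex-addurl --file', registering the URL
# for the given target file
add(path='data/archive.tar.gz',
    source='http://example.com/archive.tar.gz',
    dataset='/tmp/demo_ds')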
Example #12
    def __call__(
            path=None,
            dataset=None,
            remove_data=True,
            remove_handles=False,
            recursive=False,
            remove_history=False,
            check=True,
            kill=False,
            if_dirty='save-before'):

        # upfront check prior any resolution attempt to avoid disaster
        if path is None and dataset is None:
            raise InsufficientArgumentsError(
                "insufficient information for uninstallation (needs at "
                "least a dataset or a path. To uninstall an entire dataset "
                "it needs to be given explicitly.")

        if remove_history and not remove_handles:
            raise ValueError("`remove_history` flag, requires `remove_handles` flag")

        if not remove_data and not remove_handles:
            raise ValueError("instructed to neither drop data, nor remove handles: cannot perform")

        path, dataset_path = get_normalized_path_arguments(
            path, dataset, default=curdir)

        results = []

        if kill:
            lgr.warning("Force-removing %d paths", len(path))
            for p in path:
                rmtree(p)
                results.append(p)
            return results

        ds = require_dataset(
            dataset, check_installed=True, purpose='uninstall')
        # make sure we get to an expected state
        handle_dirty_dataset(ds, if_dirty)

        # sort paths into the respective datasets that contain them
        # considering 1st-level subdatasets at most
        # NOTE: little dance with two dicts is necessary, because ATM our
        # Datasets are not hashable enough for PY3
        whocares_paths = {}
        whocares_ds = {}
        pwd = getpwd()
        for p in path:
            if remove_handles:
                # behave like `rm -r` and refuse to remove where we are
                rpath = relpath(p, start=pwd)
                if rpath == os.curdir \
                        or rpath == os.pardir \
                        or set(psplit(rpath)) == {os.pardir}:
                    raise ValueError(
                        "refusing to remove current or parent directory")
            containerds = ds.get_containing_subdataset(p, recursion_limit=1)
            if not recursive and containerds.path != ds.path:
                raise ValueError(
                    "will not uninstall content in subdatasets without the recursive flag")
            ps = whocares_paths.get(containerds.path, [])
            ps.append(p)
            whocares_paths[containerds.path] = ps
            whocares_ds[containerds.path] = containerds

        ds_gonealready = False
        if ds.path in whocares_paths:
            # start with the content of this dataset, as any somewhat
            # total recursive removal here would have most impact
            lgr.debug("Uninstall content in {}".format(ds))
            res, ds_gonealready = _uninstall(
                whocares_ds[ds.path],
                whocares_paths[ds.path],
                check=check,
                remove_history=remove_history,
                remove_data=remove_data,
                remove_handles=remove_handles,
                recursive=recursive)
            results.extend(res)

        if ds_gonealready:
            rmtree(ds.path)
            # the underlying repo is gone, the assert makes sure that the Dataset
            # instance becomes aware of that
            assert(not ds.is_installed())
            return results

        # otherwise deal with any other subdataset
        for subdspath in whocares_paths:
            subds = whocares_ds[subdspath]
            subdsrelpath = relpath(subdspath, start=ds.path)
            if subds == ds:
                continue
            res, subds_gone = _uninstall(
                subds,
                whocares_paths[subdspath],
                check=check,
                remove_history=remove_history,
                remove_data=remove_data,
                remove_handles=remove_handles,
                recursive=recursive)
            results.extend(res)

            if subds_gone:
                # clean divorce, if we lost the subds in the process
                # find the submodule that matches the path
                # regular access goes by name, but we cannot trust
                # our own consistency, yet
                submodule = [sm for sm in ds.repo.repo.submodules
                             if sm.path == subdsrelpath][0]
                submodule.remove()
            elif remove_handles:
                # we could have removed handles -> save
                Save.__call__(
                    message='[DATALAD] uninstalled content',
                    dataset=subds,
                    auto_add_changes=False,
                    recursive=False)
                # add this change to the parent, but don't save, will do in
                # one go below
                ds.repo.add(subdsrelpath, git=True)

        if remove_handles:
            # something of the original dataset is left at this point
            # and all subdatasets have been saved already
            # -> save changes
            Save.__call__(
                message='[DATALAD] uninstalled content',
                dataset=ds,
                auto_add_changes=False,
                recursive=False)

        return results
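
For completeness, a sketch of how this (older) `uninstall` signature might be driven, assuming the `datalad.api` entry point; the paths are hypothetical:

from datalad.api import uninstall

# drop annexed data under a path, but keep the handles in Git
uninstall(path='data/big', dataset='/tmp/demo_ds',
          remove_data=True, remove_handles=False)

# additionally remove the handles; touching content in subdatasets
# also requires the recursive flag
uninstall(path='data/big', dataset='/tmp/demo_ds',
          remove_handles=True, recursive=True)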
Example #14
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            update_mode='target',
            incremental=False,
            force_extraction=False,
            save=True):
        refds_path = Interface.get_refds_path(dataset)

        # it really doesn't work without a dataset
        ds = require_dataset(
            dataset, check_installed=True, purpose='metadata aggregation')
        path = assure_list(path)
        if not path:
            # then current/reference dataset is "aggregated"
            # We should not always add ds.path, since then --recursive would
            # also recurse into the current dataset even when specific paths
            # are given
            path.append(ds.path)

        agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations(ds)
        agginfo_db = load_ds_aggregate_db(ds, abspath=True)

        to_save = []
        to_aggregate = set()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='aggregate_metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            ap_type = ap.get('type', None)
            ap_state = ap.get('state', None)
            assert('parentds' in ap or ap_type == 'dataset')
            if ap_type == 'dataset' and ap_state != 'absent':
                # a present dataset, we can take directly from it
                aggsrc = ap['path']
                lgr.info('Aggregate metadata for dataset %s', aggsrc)
            else:
                # everything else needs to come from the parent
                aggsrc = ap['parentds']
                if ap_state == 'absent':
                    lgr.info(
                        'Attempt to use pre-aggregated metadata for absent %s from dataset at %s',
                        ap['path'],
                        aggsrc)
                else:
                    lgr.info(
                        'Aggregate metadata for %s from dataset at %s',
                        ap['path'],
                        aggsrc)

            to_aggregate.add(aggsrc)

            if ap_state == 'absent':
                # key thought: recursive is done by path annotation, hence
                # once we hit an absent dataset, we are 100% certain that
                # there is nothing to recurse into on the file system
                # hence we only have to look into the aggregated metadata
                # of the last available dataset in the dataset tree edge
                #
                # if there is nothing at this path, we need to look into the
                # parentds and check if we know anything about this path
                # if we do, we need to grab all the info and objects
                # if not, we need to error
                res = _get_dsinfo_from_aggmetadata(
                    aggsrc, ap['path'], recursive, agginfo_db)
                if not isinstance(res, list):
                    yield get_status_dict(
                        status='impossible',
                        message=res,
                        action='aggregate_metadata',
                        path=ap['path'],
                        logger=lgr)
                    continue
                # cue for aggregation
                to_aggregate.update(res)
            else:
                # actually aggregate metadata for this dataset, immediately place
                # generated objects into the aggregated or reference dataset,
                # and put info into DB to get the distributed to all datasets
                # that need to be updated
                errored = _dump_extracted_metadata(
                    ds,
                    Dataset(aggsrc),
                    agginfo_db,
                    to_save,
                    force_extraction,
                    agg_base_path)
                if errored:
                    yield get_status_dict(
                        status='error',
                        message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                        action='aggregate_metadata',
                        path=aggsrc,
                        logger=lgr)

        # at this point we have dumped all aggregated metadata into object files
        # somewhere, we know what needs saving (but have not saved anything yet),
        # and we know the states of all aggregated datasets in the DB
        # what remains to be done is to update all datasets, so that each has its
        # own copy of aggregated metadata, and to update their respective
        # aggregate.json with info on what states we just aggregated from

        # first, let's figure out which datasets need updating at all
        # get adjacency info of the dataset tree spanning the base to all leaf
        # datasets associated with the path arguments
        if update_mode == 'all':
            ds_adj = {}
            discover_dataset_trace_to_targets(
                ds.path, to_aggregate, [], ds_adj,
                # we know that to_aggregate only lists datasets, existing and
                # absent ones -- we want to aggregate all of them, either from
                # just extracted metadata, or from previously aggregated metadata
                # of the closest superdataset
                includeds=to_aggregate)
            # TODO we need to work in the info about dataset that we only got from
            # aggregated metadata, that had no trace on the file system in here!!
            subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
        elif update_mode == 'target':
            subtrees = {ds.path: list(agginfo_db.keys())}
        else:
            raise ValueError(
                "unknown `update_mode` '%s' for metadata aggregation", update_mode)

        # go over datasets in bottom-up fashion
        for parentds_path in sorted(subtrees, reverse=True):
            lgr.info('Update aggregate metadata in dataset at: %s', parentds_path)

            _update_ds_agginfo(
                ds.path,
                parentds_path,
                subtrees[parentds_path],
                incremental,
                agginfo_db,
                to_save)
            # update complete
            res = get_status_dict(
                status='ok',
                action='aggregate_metadata',
                path=parentds_path,
                type='dataset',
                logger=lgr)
            res.update(agginfo_db.get(parentds_path, {}))
            yield res
        #
        # save potential modifications to dataset global metadata
        #
        if not to_save:
            return
        lgr.info('Attempting to save %i files/datasets', len(to_save))
        for res in Save.__call__(
                path=to_save,
                dataset=refds_path,
                message='[DATALAD] Dataset aggregate metadata update',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
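
A sketch of a typical aggregation run for the command above, assuming the `datalad.api` entry point and a hypothetical dataset location:

from datalad.api import aggregate_metadata

# re-extract and aggregate metadata across the whole dataset tree,
# updating every dataset on the way ('all') and saving the result
for res in aggregate_metadata(
        dataset='/tmp/demo_ds',
        recursive=True,
        update_mode='all',
        save=True):
    print(res)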
Example #15
    def __call__(path=None,
                 force=False,
                 description=None,
                 dataset=None,
                 no_annex=False,
                 save=True,
                 annex_version=None,
                 annex_backend='MD5E',
                 native_metadata_type=None,
                 if_dirty='save-before',
                 shared_access=None,
                 git_opts=None,
                 annex_opts=None,
                 annex_init_opts=None):

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if git_opts:
            lgr.warning(
                "`git_opts` argument is presently ignored, please complain!")
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify a "
                                 "description for the annex while declaring "
                                 "no annex repo.")
            if annex_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex while declaring no "
                                 "annex repo.")
            if annex_init_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex init while declaring "
                                 "no annex repo.")

        if not isinstance(force, bool):
            raise ValueError(
                "force should be bool, got %r.  Did you mean to provide a 'path'?"
                % force)

        # straight from input arg, no messing around before this
        if path is None:
            if dataset is None:
                # nothing given explicitly, assume creating a fresh dataset right here
                path = getpwd()
            else:
                # no path, but dataset -> create that dataset
                path = dataset.path
        else:
            # resolve the path against a potential dataset
            path = resolve_path(path, ds=dataset)

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        if git_opts is None:
            git_opts = {}
        if shared_access:
            # configure `git --shared` value
            git_opts['shared'] = shared_access

        # check for sane subdataset path
        real_targetpath = with_pathsep(realpath(path))  # realpath OK
        if dataset is not None:
            # make sure we get to an expected state
            if dataset.is_installed():
                handle_dirty_dataset(dataset, if_dirty)
            if not real_targetpath.startswith(  # realpath OK
                    with_pathsep(realpath(dataset.path))):  # realpath OK
                raise ValueError("path {} outside {}".format(path, dataset))

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset \
            if dataset is not None and dataset.path == path \
            else Dataset(path)

        # don't create in non-empty directory without `force`:
        if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            raise ValueError("Cannot create dataset in directory %s "
                             "(not empty). Use option 'force' in order to "
                             "ignore this and enforce creation." % tbds.path)

        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            GitRepo(tbds.path, url=None, create=True, git_opts=git_opts)
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            AnnexRepo(tbds.path,
                      url=None,
                      create=True,
                      backend=annex_backend,
                      version=annex_version,
                      description=description,
                      git_opts=git_opts,
                      annex_opts=annex_opts,
                      annex_init_opts=annex_init_opts)

        if native_metadata_type is not None:
            if not isinstance(native_metadata_type, list):
                native_metadata_type = [native_metadata_type]
            for nt in native_metadata_type:
                tbds.config.add('datalad.metadata.nativetype', nt)

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a re-create
            tbds.config.unset(id_var, where='dataset')
        tbds.config.add(id_var,
                        tbds.id if tbds.id is not None else
                        uuid.uuid1().urn.split(':')[-1],
                        where='dataset')

        # save everything
        tbds.repo.add('.datalad', git=True)

        if save:
            Save.__call__(message='[DATALAD] new dataset',
                          dataset=tbds,
                          auto_add_changes=False,
                          recursive=False)

        if dataset is not None and dataset.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            from datalad.distribution.utils import _install_subds_inplace
            subdsrelpath = relpath(realpath(tbds.path),
                                   realpath(dataset.path))  # realpath OK
            _install_subds_inplace(ds=dataset,
                                   path=tbds.path,
                                   relativepath=subdsrelpath)
            # this will have staged the changes in the superdataset already
            if save:
                Save.__call__(message='[DATALAD] added subdataset',
                              dataset=dataset,
                              auto_add_changes=False,
                              recursive=False)

        return tbds
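
Finally, a sketch of the two major cases distinguished at the top of this `create` implementation, assuming the `datalad.api` entry point; the locations are hypothetical:

from datalad.api import create

# no dataset given -> create a fresh annex-backed dataset at `path`
ds = create(path='/tmp/demo_ds')

# dataset given and `path` inside it -> create a subdataset, which is
# registered as a submodule and saved in the superdataset
subds = create(path='/tmp/demo_ds/code', dataset=ds, no_annex=True)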