Example #1
def diff_dataset(dataset,
                 fr,
                 to,
                 constant_refs,
                 path=None,
                 annex=None,
                 untracked='normal',
                 recursive=False,
                 recursion_limit=None,
                 eval_file_type=True,
                 reporting_order='depth-first'):
    """Internal helper to diff a dataset

    Parameters
    ----------
    dataset : Dataset
      Dataset to perform the diff on. `fr` and `to` parameters are interpreted
      in the context of this dataset.
    fr : str
      Commit-ish to compare from.
    to : str
      Commit-ish to compare to.
    constant_refs : bool
      If True, `fr` and `to` will be passed on unmodified to diff operations
      on subdatasets. This can be useful with symbolic references like tags
      to report subdataset changes independent of superdataset changes.
      If False, `fr` and `to` will each be translated to the subdataset
      commit-ish that matches the given commit-ish in the superdataset.
    path : Path-like, optional
      Paths to constrain the diff to (see main diff() command).
    annex : str, optional
      Reporting mode for annex properties (see main diff() command).
    untracked : str, optional
      Reporting mode for untracked content (see main diff() command).
    recursive : bool, optional
      Flag to enable recursive operation (see main diff() command).
    recursion_limit : int, optional
      Recursion limit (see main diff() command).
    eval_file_type : bool, optional
      Whether to perform file type discrimination between real symlinks
      and symlinks representing annex'ed files. This can be expensive
      in datasets with many files.
    reporting_order : {'depth-first', 'breadth-first'}, optional
      By default, subdataset content records are reported after the record
      on the subdataset's submodule in a superdataset (depth-first).
      Alternatively, report all superdataset records first, before reporting
      any subdataset content records (breadth-first).

    Yields
    ------
    dict
      DataLad result records.
    """
    if reporting_order not in ('depth-first', 'breadth-first'):
        raise ValueError('Unknown reporting order: {}'.format(reporting_order))

    ds = require_dataset(dataset,
                         check_installed=True,
                         purpose='difference reporting')

    # we cannot really perform any sorting of paths into subdatasets
    # or rejecting paths based on the state of the filesystem, as
    # we need to be able to compare with states that are not represented
    # in the worktree (anymore)
    if path:
        ps = []
        # sort any path argument into the respective subdatasets
        for p in sorted(assure_list(path)):
            # it is important to capture the exact form of the
            # given path argument, before any normalization happens
            # distinguish rsync-link syntax to identify
            # a dataset as a whole (e.g. 'ds') vs its
            # content (e.g. 'ds/');
            # a special case is the root dataset, for which we always report its content
            # changes
            orig_path = str(p)
            resolved_path = resolve_path(p, dataset)
            p = (resolved_path,
                 orig_path.endswith(op.sep) or resolved_path == ds.pathobj)
            str_path = str(p[0])
            root = get_dataset_root(str_path)
            if root is None:
                # no root, not possibly underneath the refds
                yield dict(action='status',
                           path=str_path,
                           refds=ds.path,
                           status='error',
                           message='path not underneath this dataset',
                           logger=lgr)
                continue
            if path_under_rev_dataset(ds, str_path) is None:
                # nothing we support handling any further
                # there is only a single refds
                yield dict(
                    path=str_path,
                    refds=ds.path,
                    action='diff',
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", ds, str_path),
                    logger=lgr,
                )
                continue

            ps.append(p)
        path = ps

    # TODO we might want to move away from the single-pass+immediate-yield
    # paradigm for this command. If we gather all information first, we
    # could do post-processing and detect when a file (same gitsha, or same
    # key) was copied/moved from another dataset. Another command (e.g.
    # save) could act on this information and also move/copy
    # availability information or at least enhance the respective commit
    # message with cross-dataset provenance info

    # cache to help avoid duplicate status queries
    content_info_cache = {}
    for res in _diff_ds(
            ds,
            fr,
            to,
            constant_refs,
            recursion_limit if recursion_limit is not None and recursive else
            -1 if recursive else 0,
            # TODO recode paths to repo path reference
            origpaths=None if not path else OrderedDict(path),
            untracked=untracked,
            annexinfo=annex,
            eval_file_type=eval_file_type,
            cache=content_info_cache,
            order=reporting_order):
        res.update(
            refds=ds.path,
            logger=lgr,
            action='diff',
        )
        yield res
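
# --- illustrative usage sketch (editor addition, not part of the original code) ---
# A hedged example of how the `diff_dataset` helper above could be driven.
# The import location and the dataset path are assumptions made purely for
# illustration; in DataLad this helper is internal and normally reached via
# the user-facing `diff`/`status` commands.
def _demo_diff_dataset():
    from datalad.distribution.dataset import Dataset
    ds = Dataset('/tmp/some_dataset')  # hypothetical, already installed dataset
    # fr='HEAD' with to=None is assumed to compare the last recorded state
    # against the current worktree, mirroring the user-facing diff defaults
    for res in diff_dataset(dataset=ds,
                            fr='HEAD',
                            to=None,
                            constant_refs=False,
                            recursive=False):
        # each result is a plain dict (see the Yields section of the docstring)
        print(res.get('status'), res.get('state'), res.get('path'))
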
Example #2
    def __call__(path=None,
                 initopts=None,
                 *,
                 force=False,
                 description=None,
                 dataset=None,
                 annex=True,
                 fake_dates=False,
                 cfg_proc=None):
        # we only perform negative tests below
        no_annex = not annex

        if dataset:
            if isinstance(dataset, Dataset):
                ds = dataset
            else:
                ds = Dataset(dataset)
            refds_path = ds.path
        else:
            ds = refds_path = None

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if (isinstance(initopts, (list, tuple))
                and '--bare' in initopts) or (isinstance(initopts, dict)
                                              and 'bare' in initopts):
            raise ValueError(
                "Creation of bare repositories is not supported. Consider "
                "one of the create-sibling commands, or use "
                "Git to init a bare repository and push an existing dataset "
                "into it.")

        if path:
            path = resolve_path(path, dataset)

        path = path if path \
            else getpwd() if ds is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # assure cfg_proc is a list (relevant if used via Python API)
        cfg_proc = ensure_list(cfg_proc)

        # prep for yield
        res = dict(action='create',
                   path=str(path),
                   logger=lgr,
                   type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != str(path):
            refds = require_dataset(refds_path,
                                    check_installed=True,
                                    purpose='create a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", ds, str(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = get_dataset_root(
            op.normpath(op.join(str(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if (not pstatus.get(check_path, {}).get("type") == "dataset"
                    and any(check_path == p or check_path in p.parents
                            for p in pstatus)):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents
                ]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with content in parent dataset at %s: %s',
                        str(parentds_path), [str(c) for c in conflict])
                })
                yield res
                return
            if not force:
                # another set of checks to see whether the target path is pointing
                # into a known subdataset that is not around ATM
                subds_status = {
                    parentds_path / k.relative_to(prepo.path)
                    for k, v in pstatus.items()
                    if v.get('type', None) == 'dataset'
                }
                check_paths = [check_path]
                check_paths.extend(check_path.parents)
                if any(p in subds_status for p in check_paths):
                    conflict = [p for p in check_paths if p in subds_status]
                    res.update({
                        'status': 'error',
                        'message': (
                            'collision with %s (dataset) in dataset %s',
                            str(conflict[0]), str(parentds_path))
                    })
                    yield res
                    return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = ds if isinstance(ds, Dataset) and \
            ds.path == path else Dataset(str(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status': 'error',
                'message':
                    'will not create a dataset in a non-empty directory, use '
                    '`--force` option to ignore'
            })
            yield res
            return

        # Check if specified cfg_proc(s) can be discovered, storing
        # the results so they can be used when the time comes to run
        # the procedure. If a procedure cannot be found, raise an
        # error to prevent creating the dataset.
        cfg_proc_specs = []
        if cfg_proc:
            discovered_procs = tbds.run_procedure(
                discover=True,
                result_renderer='disabled',
                return_type='generator',
            )
            for cfg_proc_ in cfg_proc:
                for discovered_proc in discovered_procs:
                    if discovered_proc['procedure_name'] == 'cfg_' + cfg_proc_:
                        cfg_proc_specs.append(discovered_proc)
                        break
                else:
                    raise ValueError("Cannot find procedure with name "
                                     "'%s'" % cfg_proc_)

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # Note for the code below:
        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Re-use tbrepo instance, do not use tbds.repo

        # create and configure desired repository
        # also provides initial set of content to be tracked with git (not annex)
        if no_annex:
            tbrepo, add_to_git = _setup_git_repo(path, initopts, fake_dates)
        else:
            tbrepo, add_to_git = _setup_annex_repo(path, initopts, fake_dates,
                                                   description)

        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Note: this must not happen earlier (before the if/else above), since
        # being "smart" about it would not work at that point
        tbds_config = tbds.config

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note that the Dataset property `id` will change when we unset the
        # respective config. Therefore store it beforehand:
        tbds_id = tbds.id
        if id_var in tbds_config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds_config.unset(id_var, scope='branch')

        if _seed is None:
            # just the standard way
            # use a fully random identifier (i.e. UUID version 4)
            uuid_id = str(uuid.uuid4())
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds_config.add(id_var,
                        tbds_id if tbds_id is not None else uuid_id,
                        scope='branch',
                        reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in tbds_config.overrides.items():
            tbds_config.add(k, v, scope='local', reload=False)

        # all config manipulation is done -> full reload
        tbds_config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbrepo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'
        }

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbrepo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        for cfg_proc_spec in cfg_proc_specs:
            yield from tbds.run_procedure(
                cfg_proc_spec,
                result_renderer='disabled',
                return_type='generator',
            )

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(refds, Dataset) and refds.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            yield from refds.save(
                path=tbds.path,
                return_type='generator',
                result_renderer='disabled',
            )

        res.update({'status': 'ok'})
        yield res
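
# --- illustrative usage sketch (editor addition, not part of the original code) ---
# A hedged example of how the create command implemented above is commonly
# reached through DataLad's high-level Python API. The paths and the
# 'text2git' procedure name are assumptions for illustration only.
def _demo_create():
    import datalad.api as dl
    superds = dl.create(path='/tmp/demo_super')        # fresh annex dataset
    subds = dl.create(path='/tmp/demo_super/sub',
                      dataset=superds,                  # register as a subdataset
                      cfg_proc=['text2git'])            # apply the cfg_text2git procedure
    return superds, subds
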
Example #3
    def __call__(path=None,
                 initopts=None,
                 force=False,
                 description=None,
                 dataset=None,
                 no_annex=False,
                 fake_dates=False,
                 cfg_proc=None):
        refds_path = dataset.path if hasattr(dataset, 'path') else dataset

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if path:
            path = rev_resolve_path(path, dataset)

        path = path if path \
            else getpwd() if dataset is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # prep for yield
        res = dict(action='create',
                   path=text_type(path),
                   logger=lgr,
                   type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != path:
            refds = require_dataset(refds_path,
                                    check_installed=True,
                                    purpose='creating a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", dataset,
                        text_type(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = rev_get_dataset_root(
            op.normpath(op.join(text_type(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = ut.Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = ut.Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if any(check_path == p or check_path in p.parents
                   for p in pstatus):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents
                ]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with content in parent dataset at %s: %s',
                        text_type(parentds_path),
                        [text_type(c) for c in conflict])
                })
                yield res
                return
            # another set of checks to see whether the target path is pointing
            # into a known subdataset that is not around ATM
            subds_status = {
                parentds_path / k.relative_to(prepo.path)
                for k, v in iteritems(pstatus)
                if v.get('type', None) == 'dataset'
            }
            check_paths = [check_path]
            check_paths.extend(check_path.parents)
            if any(p in subds_status for p in check_paths):
                conflict = [p for p in check_paths if p in subds_status]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with %s (dataset) in dataset %s',
                        text_type(conflict[0]), text_type(parentds_path))
                })
                yield res
                return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and \
            dataset.path == path else Dataset(text_type(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status': 'error',
                'message':
                    'will not create a dataset in a non-empty directory, use '
                    '`force` option to ignore'
            })
            yield res
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = {}

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # create and configure desired repository
        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            tbrepo = GitRepo(tbds.path,
                             url=None,
                             create=True,
                             create_sanity_checks=False,
                             git_opts=initopts,
                             fake_dates=fake_dates)
            # place a .noannex file to tell annex to leave this repo alone
            stamp_path = ut.Path(tbrepo.path) / '.noannex'
            stamp_path.touch()
            add_to_git[stamp_path] = {'type': 'file', 'state': 'untracked'}
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                # do not set backend here, to avoid a dedicated commit
                backend=None,
                # None causes version to be taken from config
                version=None,
                description=description,
                git_opts=initopts,
                fake_dates=fake_dates)
            # set the annex backend in .gitattributes as a staged change
            tbrepo.set_default_backend(cfg.obtain('datalad.repo.backend'),
                                       persistent=True,
                                       commit=False)
            add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'added'
            }
            # make sure that v6 annex repos never commit content under .datalad
            attrs_cfg = (
                ('config', 'annex.largefiles', 'nothing'),
                ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
                ('metadata/objects/**', 'annex.largefiles', '({})'.format(
                    cfg.obtain('datalad.metadata.create-aggregate-annex-limit'))))
            attrs = tbds.repo.get_gitattributes(
                [op.join('.datalad', i[0]) for i in attrs_cfg])
            set_attrs = []
            for p, k, v in attrs_cfg:
                if not attrs.get(op.join('.datalad', p), {}).get(k, None) == v:
                    set_attrs.append((p, {k: v}))
            if set_attrs:
                tbds.repo.set_gitattributes(set_attrs,
                                            attrfile=op.join(
                                                '.datalad', '.gitattributes'))

            # prevent git annex from ever annexing .git* stuff (gh-1597)
            attrs = tbds.repo.get_gitattributes('.git')
            if not attrs.get('.git', {}).get('annex.largefiles',
                                             None) == 'nothing':
                tbds.repo.set_gitattributes([('**/.git*', {
                    'annex.largefiles': 'nothing'
                })])
                # must use the repo.pathobj as this will have resolved symlinks
                add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                    'type': 'file',
                    'state': 'untracked'
                }

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note that the Dataset property `id` will change when we unset the
        # respective config. Therefore store it beforehand:
        tbds_id = tbds.id
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds.config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds.config.add(id_var,
                        tbds_id if tbds_id is not None else uuid_id,
                        where='dataset',
                        reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in iteritems(tbds.config.overrides):
            tbds.config.add(k, v, where='local', reload=False)

        # all config manipulation is done -> full reload
        tbds.config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbds.repo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'
        }

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.repo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(refds, Dataset) and refds.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in refds.save(path=tbds.path):
                yield r

        res.update({'status': 'ok'})
        yield res

        for cfg_proc_ in cfg_proc or []:
            for r in tbds.run_procedure('cfg_' + cfg_proc_):
                yield r
Example #4
    def __call__(path=None,
                 dataset=None,
                 annex=None,
                 untracked='normal',
                 recursive=False,
                 recursion_limit=None,
                 eval_subdataset_state='full'):
        # To the next white knight that comes in to re-implement `status` as a
        # special case of `diff`. There is one fundamental difference between
        # the two commands: `status` can always use the worktree as evident on
        # disk as a constraint (e.g. to figure out which subdataset a path is in);
        # `diff` cannot do that (everything needs to be handled based on a
        # "virtual" representation of a dataset hierarchy).
        # MIH concludes that while `status` can be implemented as a special case
        # of `diff`, doing so would complicate and slow down both `diff` and
        # `status`. So while the apparent near-duplication of code between the
        # two commands feels wrong, the benefit is speed. Any future RF should
        # come with evidence that speed does not suffer, and complexity stays
        # on a manageable level
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='status reporting')

        paths_by_ds = OrderedDict()
        if path:
            # sort any path argument into the respective subdatasets
            for p in sorted(assure_list(path)):
                # it is important to capture the exact form of the
                # given path argument, before any normalization happens
                # for further decision logic below
                orig_path = text_type(p)
                p = rev_resolve_path(p, dataset)
                root = rev_get_dataset_root(text_type(p))
                if root is None:
                    # no root, not possibly underneath the refds
                    yield dict(action='status',
                               path=p,
                               refds=ds.path,
                               status='error',
                               message='path not underneath this dataset',
                               logger=lgr)
                    continue
                else:
                    if dataset and root == text_type(p) and \
                            not (orig_path.endswith(op.sep) or
                                 orig_path == "."):
                        # the given path is pointing to a dataset
                        # distinguish rsync-link syntax to identify
                        # the dataset as a whole (e.g. 'ds') vs its
                        # content (e.g. 'ds/')
                        super_root = rev_get_dataset_root(op.dirname(root))
                        if super_root:
                            # the dataset identified by the path argument
                            # is contained in a superdataset, and no
                            # trailing path separator was found in the
                            # argument -> user wants to address the dataset
                            # as a whole (in the superdataset)
                            root = super_root

                root = ut.Path(root)
                ps = paths_by_ds.get(root, [])
                ps.append(p)
                paths_by_ds[root] = ps
        else:
            paths_by_ds[ds.pathobj] = None

        queried = set()
        content_info_cache = {}
        while paths_by_ds:
            qdspath, qpaths = paths_by_ds.popitem(last=False)
            if qpaths and qdspath in qpaths:
                # this is supposed to be a full query, save some
                # cycles sifting through the actual path arguments
                qpaths = []
            # try to recode the dataset path wrt the reference
            # dataset
            # the path that it might have been located by could
            # have been a resolved path or another funky thing
            qds_inrefds = path_under_rev_dataset(ds, qdspath)
            if qds_inrefds is None:
                # nothing we support handling any further
                # there is only a single refds
                yield dict(
                    path=text_type(qdspath),
                    refds=ds.path,
                    action='status',
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", ds, qpaths),
                    logger=lgr,
                )
                continue
            elif qds_inrefds != qdspath:
                # the path this dataset was located by is not how it would
                # be referenced underneath the refds (possibly resolved
                # realpath) -> recode all paths to be underneath the refds
                qpaths = [qds_inrefds / p.relative_to(qdspath) for p in qpaths]
                qdspath = qds_inrefds
            if qdspath in queried:
                # do not report on a single dataset twice
                continue
            qds = Dataset(text_type(qdspath))
            for r in _yield_status(
                    qds, qpaths, annex, untracked, recursion_limit
                    if recursion_limit is not None else -1 if recursive else 0,
                    queried, eval_subdataset_state, content_info_cache):
                yield dict(
                    r,
                    refds=ds.path,
                    action='status',
                    status='ok',
                )
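
# --- illustrative usage sketch (editor addition, not part of the original code) ---
# A hedged example of driving a status command like the one above through
# DataLad's high-level Python API; the dataset path and the annex reporting
# mode are assumptions for illustration only.
def _demo_status():
    import datalad.api as dl
    for res in dl.status(dataset='/tmp/some_dataset',
                         annex='availability',      # include annex key availability
                         recursive=True,
                         return_type='generator',
                         result_renderer='disabled'):
        print(res['status'], res.get('state'), res['path'])
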
Example #5
    def __call__(urls,
                 *,
                 dataset=None,
                 path=None,
                 overwrite=False,
                 archive=False,
                 save=True,
                 message=None):
        from ..downloaders.http import HTTPDownloader
        from ..downloaders.providers import Providers

        ds = None
        if save or dataset:
            try:
                ds = require_dataset(dataset,
                                     check_installed=True,
                                     purpose='download urls')
            except NoDatasetFound:
                pass

        common_report = {"action": "download_url", "ds": ds}

        got_ds_instance = isinstance(dataset, Dataset)
        dir_is_target = not path or str(path).endswith(op.sep)
        path = str(resolve_path(path or op.curdir, ds=dataset))
        if dir_is_target:
            # resolve_path() doesn't preserve trailing separators. Add one for
            # the download() call.
            path = path + op.sep
        urls = ensure_list_from_str(urls)

        if not dir_is_target:
            if len(urls) > 1:
                yield get_status_dict(
                    status="error",
                    message=(
                        "When specifying multiple urls, --path should point to "
                        "a directory target (with a trailing separator). Got %r",
                        path),
                    type="file",
                    path=path,
                    **common_report)
                return
            if archive:
                # make sure the file suffix indicated by a URL is preserved
                # so that any further archive processing doesn't have to
                # employ mime type inspection in order to determine the archive
                # type
                from datalad.support.network import URL
                suffixes = PurePosixPath(URL(urls[0]).path).suffixes
                if not Path(path).suffixes == suffixes:
                    path += ''.join(suffixes)
            # we know that we have a single URL
            # download() would be fine getting an existing directory and
            # downloading the URL underneath it, but let's enforce a trailing
            # slash here for consistency.
            if op.isdir(path):
                yield get_status_dict(
                    status="error",
                    message=(
                        "Non-directory path given (no trailing separator) "
                        "but a directory with that name (after adding archive "
                        "suffix) exists"),
                    type="file",
                    path=path,
                    **common_report)
                return

        # TODO setup fancy ui.progressbars doing this in parallel and reporting overall progress
        # in % of urls which were already downloaded
        providers = Providers.from_config_files()
        downloaded_paths = []
        path_urls = {}
        need_datalad_remote = False
        for url in urls:
            # somewhat "ugly"
            downloader = providers.get_provider(url).get_downloader(url)
            try:
                downloaded_path = downloader.download(url,
                                                      path=path,
                                                      overwrite=overwrite)
            except Exception as e:
                ce = CapturedException(e)
                yield get_status_dict(status="error",
                                      message=str(ce),
                                      type="file",
                                      path=path,
                                      exception=ce,
                                      **common_report)
            else:
                if not need_datalad_remote \
                   and (downloader.authenticator or downloader.credential or
                        type(downloader) != HTTPDownloader):
                    need_datalad_remote = True
                downloaded_paths.append(downloaded_path)
                path_urls[downloaded_path] = url
                yield get_status_dict(status="ok",
                                      type="file",
                                      path=downloaded_path,
                                      **common_report)

        if downloaded_paths and save and ds is not None:
            msg = message or """\
[DATALAD] Download URLs

URLs:
  {}""".format("\n  ".join(urls))

            for r in Save()(
                    downloaded_paths,
                    message=msg,
                    # ATTN: Pass the original dataset argument to
                    # preserve relative path handling semantics.
                    dataset=dataset,
                    return_type="generator",
                    result_renderer='disabled',
                    result_xfm=None,
                    result_filter=None,
                    on_failure="ignore"):
                yield r

            ds_repo = ds.repo
            if isinstance(ds_repo, AnnexRepo):
                if need_datalad_remote:
                    from datalad.customremotes.base import ensure_datalad_remote
                    ensure_datalad_remote(ds_repo,
                                          autoenable=True,
                                          encryption=None)

                if got_ds_instance:
                    # Paths in `downloaded_paths` are already relative to the
                    # dataset.
                    rpaths = dict(zip(downloaded_paths, downloaded_paths))
                else:
                    # Paths in `downloaded_paths` are already relative to the
                    # current working directory. Take these relative to the
                    # dataset for use with the AnnexRepo method calls.
                    rpaths = {}
                    for orig_path, resolved in zip(
                            downloaded_paths,
                            resolve_path(downloaded_paths, ds=dataset)):
                        rpath = path_under_rev_dataset(ds, resolved)
                        if rpath:
                            rpaths[str(rpath)] = orig_path
                        else:
                            lgr.warning("Path %s not under dataset %s",
                                        orig_path, ds)
                annex_paths = [
                    p for p, annexed in zip(
                        rpaths, ds_repo.is_under_annex(list(rpaths.keys())))
                    if annexed
                ]
                if annex_paths:
                    for path in annex_paths:
                        url = path_urls[rpaths[path]]
                        try:
                            # The file is already present. This is just to
                            # register the URL.
                            ds_repo.add_url_to_file(
                                path,
                                url,
                                # avoid batch mode for single files
                                # https://github.com/datalad/datalad/issues/2849
                                batch=len(annex_paths) > 1,
                                # bypass URL size check, we already have the file
                                options=['--relaxed'])
                        except CommandError as exc:
                            lgr.warning("Registering %s with %s failed: %s",
                                        path, url, CapturedException(exc))

                    if archive:
                        for path in annex_paths:
                            yield from ds.add_archive_content(
                                path,
                                delete=True,
                                on_failure='ignore',
                                return_type='generator',
                                result_renderer='disabled')
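
# --- illustrative usage sketch (editor addition, not part of the original code) ---
# A hedged example of using a download-url command like the one above via
# DataLad's high-level Python API; the URL, dataset location, and target
# directory are assumptions for illustration only.
def _demo_download_url():
    import datalad.api as dl
    for res in dl.download_url(
            'https://example.com/data.csv',
            dataset='/tmp/some_dataset',
            path='inputs/',          # trailing separator: treat as a directory target
            archive=False,
            save=True,
            return_type='generator',
            result_renderer='disabled'):
        print(res['status'], res['path'])
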
Example #6
def get_paths_by_ds(refds, dataset_arg, paths, subdsroot_mode='rsync'):
    """Resolve and sort any paths into their containing datasets

    Any path will be associated with (sorted into) its nearest containing dataset.
    It is irrelevant whether or not a path presently exists on the file system.
    However, only datasets that exist on the file system are used for
    sorting/association -- known, but non-existent subdatasets are not
    considered.

    Parameters
    ----------
    refds: Dataset
    dataset_arg: Dataset or str or Path or None
      Any supported value given to a command's `dataset` argument. Given
      to `resolve_path()`.
    paths: list
      Any number of absolute or relative paths, in str-form or as
      Path instances, to be sorted into their respective datasets. See also
      the `subdsroot_mode` parameter.
    subdsroot_mode: {'rsync', 'super', 'sub'}
      Switch behavior for paths that are the root of a subdataset. By default
      ('rsync'), such a path is associated with its parent/superdataset,
      unless the path ends with a trailing directory separator, in which case
      it is sorted into the subdataset record (this resembles the path
      semantics of rsync, hence the label). In 'super' mode, the path is always
      placed with the superdataset record. Likewise, in 'sub' mode the path
      is always placed into the subdataset record.

    Returns
    -------
    dict, list
      The first return value is the main result, a dictionary with root
      directories of all discovered datasets as keys and a list of the
      associated paths inside these datasets as values.  Keys and values are
      normalized to be Path instances of absolute paths.
      The second return value is a list of all paths (again Path instances)
      that are not located underneath the reference dataset.
    """
    ds_path = refds.path
    paths_by_ds = OrderedDict()
    errors = []

    if not paths:
        # that was quick
        paths_by_ds[refds.pathobj] = None
        return paths_by_ds, errors

    # in order to guarantee proper path sorting, we first need to resolve all
    # of them (some may be str, some Path, some relative, some absolute)
    # step 1: normalize to unicode
    paths = map(ensure_unicode, paths)
    # step 2: resolve
    # for later comparison, we need to preserve the original value too
    paths = [(resolve_path(p, dataset_arg), str(p)) for p in paths]
    # OPT: store cache for dataset roots for each directory directly
    #      listed in paths, or containing the path (if file)
    roots_cache = {}
    # sort any path argument into the respective subdatasets
    # sort by comparing the resolved Path instances, this puts top-level
    # paths first, leading to their datasets to be injected into the result
    # dict first
    for p, orig_path in sorted(paths, key=lambda x: x[0]):
        # TODO (left from implementing caching OPT):
        # Logic here sounds duplicated with discover_dataset_trace_to_targets
        # and even get_tree_roots of save.
        str_p = str(p)

        # query get_dataset_root caching for repeated queries within the same
        # directory
        if p.is_dir():
            p_dir = str(p)
        else:  # symlink, file, whatnot - seems to match logic in get_dataset_root
            p_dir = str(p.parent)

        try:
            root = roots_cache[p_dir]
        except KeyError:
            root = roots_cache[p_dir] = get_dataset_root(p_dir)

        # to become the root of the dataset that contains the path in question,
        # expressed in the context of (i.e. under the same base path as) the reference dataset
        qds_inrefds = None
        if root is not None:
            qds_inrefds = path_under_rev_dataset(refds, root)
        if root is None or qds_inrefds is None:
            # no root, not possibly underneath the refds
            # or root that is not underneath/equal the reference dataset root
            errors.append(p)
            continue

        if root != qds_inrefds:
            # try to recode the dataset path wrt the reference
            # dataset
            # the path that it might have been located by could
            # have been a resolved path or another funky thing
            # the path this dataset was located by is not how it would
            # be referenced underneath the refds (possibly resolved
            # realpath) -> recode all paths to be underneath the refds
            p = qds_inrefds / p.relative_to(root)
            root = qds_inrefds

        # Note: Compare to Dataset(root).path rather
        # than root to get same path normalization.
        if root == str_p and not Dataset(root).path == ds_path and (
                subdsroot_mode == 'super' or
            (subdsroot_mode == 'rsync' and dataset_arg
             and not orig_path.endswith(op.sep))):
            # the given path is pointing to a subdataset
            # and we are either in 'super' mode, or in 'rsync' and found
            # rsync-link syntax to identify the dataset as a whole
            # (e.g. 'ds') vs its content (e.g. 'ds/')
            root_dir = op.dirname(root)
            try:
                super_root = roots_cache[root_dir]
            except KeyError:
                super_root = roots_cache[root_dir] = get_dataset_root(root_dir)
            if super_root:
                # the dataset identified by the path argument
                # is contained in a superdataset, and no
                # trailing path separator was found in the
                # argument -> user wants to address the dataset
                # as a whole (in the superdataset)
                root = super_root

        root = ut.Path(root)
        ps = paths_by_ds.get(root, [])
        ps.append(p)
        paths_by_ds[root] = ps
    return paths_by_ds, errors
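
# --- illustrative usage sketch (editor addition, not part of the original code) ---
# A hedged example of how the `get_paths_by_ds` helper above could be used.
# The dataset location and the path arguments describe a hypothetical layout
# in which 'subds' is the root of an installed subdataset.
def _demo_get_paths_by_ds():
    from datalad.distribution.dataset import Dataset
    refds = Dataset('/tmp/some_dataset')
    paths_by_ds, errors = get_paths_by_ds(
        refds,
        dataset_arg=refds,
        # in the default 'rsync' mode, 'subds' (no trailing separator) is
        # associated with its superdataset record, whereas 'subds/' would be
        # sorted into the subdataset record
        paths=['code/analysis.py', 'subds'],
        subdsroot_mode='rsync')
    for ds_root, ps in paths_by_ds.items():
        print(ds_root, [str(p) for p in ps])
    # `errors` collects any paths not located underneath the reference dataset
    print(errors)
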
Example #7
    def __call__(
            path=None,
            dataset=None,
            annex=None,
            untracked='normal',
            recursive=False,
            recursion_limit=None,
            eval_subdataset_state='full'):
        # To the next white knight that comes in to re-implement `status` as a
        # special case of `diff`. There is one fundamental difference between
        # the two commands: `status` can always use the worktree as evident on
        # disk as a constraint (e.g. to figure out which subdataset a path is in);
        # `diff` cannot do that (everything needs to be handled based on a
        # "virtual" representation of a dataset hierarchy).
        # MIH concludes that while `status` can be implemented as a special case
        # of `diff`, doing so would complicate and slow down both `diff` and
        # `status`. So while the apparent near-duplication of code between the
        # two commands feels wrong, the benefit is speed. Any future RF should
        # come with evidence that speed does not suffer, and complexity stays
        # on a manageable level
        ds = require_dataset(
            dataset, check_installed=True, purpose='status reporting')

        paths_by_ds = OrderedDict()
        if path:
            # sort any path argument into the respective subdatasets
            for p in sorted(assure_list(path)):
                # it is important to capture the exact form of the
                # given path argument, before any normalization happens
                # for further decision logic below
                orig_path = text_type(p)
                p = rev_resolve_path(p, dataset)
                root = rev_get_dataset_root(text_type(p))
                if root is None:
                    # no root, not possibly underneath the refds
                    yield dict(
                        action='status',
                        path=p,
                        refds=ds.path,
                        status='error',
                        message='path not underneath this dataset',
                        logger=lgr)
                    continue
                else:
                    if dataset and root == text_type(p) and \
                            not (orig_path.endswith(op.sep) or
                                 orig_path == "."):
                        # the given path is pointing to a dataset
                        # distinguish rsync-link syntax to identify
                        # the dataset as a whole (e.g. 'ds') vs its
                        # content (e.g. 'ds/')
                        super_root = rev_get_dataset_root(op.dirname(root))
                        if super_root:
                            # the dataset identified by the path argument
                            # is contained in a superdataset, and no
                            # trailing path separator was found in the
                            # argument -> user wants to address the dataset
                            # as a whole (in the superdataset)
                            root = super_root

                root = ut.Path(root)
                ps = paths_by_ds.get(root, [])
                ps.append(p)
                paths_by_ds[root] = ps
        else:
            paths_by_ds[ds.pathobj] = None

        queried = set()
        content_info_cache = {}
        while paths_by_ds:
            qdspath, qpaths = paths_by_ds.popitem(last=False)
            if qpaths and qdspath in qpaths:
                # this is supposed to be a full query, save some
                # cycles sifting through the actual path arguments
                qpaths = []
            # try to recode the dataset path wrt the reference
            # dataset
            # the path that it might have been located by could
            # have been a resolved path or another funky thing
            qds_inrefds = path_under_rev_dataset(ds, qdspath)
            if qds_inrefds is None:
                # nothing we support handling any further
                # there is only a single refds
                yield dict(
                    path=text_type(qdspath),
                    refds=ds.path,
                    action='status',
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s",
                        ds, qpaths),
                    logger=lgr,
                )
                continue
            elif qds_inrefds != qdspath:
                # the path this dataset was located by is not how it would
                # be referenced underneath the refds (possibly resolved
                # realpath) -> recode all paths to be underneath the refds
                qpaths = [qds_inrefds / p.relative_to(qdspath) for p in qpaths]
                qdspath = qds_inrefds
            if qdspath in queried:
                # do not report on a single dataset twice
                continue
            qds = Dataset(text_type(qdspath))
            for r in _yield_status(
                    qds,
                    qpaths,
                    annex,
                    untracked,
                    recursion_limit
                    if recursion_limit is not None else -1
                    if recursive else 0,
                    queried,
                    eval_subdataset_state,
                    content_info_cache):
                yield dict(
                    r,
                    refds=ds.path,
                    action='status',
                    status='ok',
                )
Example #8
    def __call__(
            path=None,
            initopts=None,
            force=False,
            description=None,
            dataset=None,
            no_annex=False,
            fake_dates=False,
            cfg_proc=None
    ):
        refds_path = dataset.path if hasattr(dataset, 'path') else dataset

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if path:
            path = rev_resolve_path(path, dataset)

        path = path if path \
            else getpwd() if dataset is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert(path is not None)

        # prep for yield
        res = dict(action='create', path=text_type(path),
                   logger=lgr, type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != path:
            refds = require_dataset(
                refds_path, check_installed=True,
                purpose='creating a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s",
                        dataset, text_type(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = rev_get_dataset_root(
            op.normpath(op.join(text_type(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = ut.Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = ut.Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if any(
                    check_path == p or check_path in p.parents
                    for p in pstatus):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with content in parent dataset at %s: %s',
                        text_type(parentds_path),
                        [text_type(c) for c in conflict])})
                yield res
                return
            # another check to see whether the target path points into a
            # known subdataset that is currently not installed
            subds_status = {
                parentds_path / k.relative_to(prepo.path)
                for k, v in iteritems(pstatus)
                if v.get('type', None) == 'dataset'}
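            # `subds_status` holds the absolute paths of all subdatasets
            # registered in the parent; refuse to create a dataset at, or
            # inside, such a location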
            check_paths = [check_path]
            check_paths.extend(check_path.parents)
            if any(p in subds_status for p in check_paths):
                conflict = [p for p in check_paths if p in subds_status]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with %s (dataset) in dataset %s',
                        text_type(conflict[0]),
                        text_type(parentds_path))})
                yield res
                return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and \
            dataset.path == path else Dataset(text_type(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status': 'error',
                'message':
                    'will not create a dataset in a non-empty directory, use '
                    '`force` option to ignore'})
            yield res
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = {}

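        # a plain list of init options is wrapped so that repository creation
        # below treats it as command-line style arguments (presumably via the
        # '_from_cmdline_' key)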
        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # create and configure desired repository
        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            tbrepo = GitRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                git_opts=initopts,
                fake_dates=fake_dates)
            # place a .noannex file to signal to git-annex that it should
            # leave this repo alone
            stamp_path = ut.Path(tbrepo.path) / '.noannex'
            stamp_path.touch()
            add_to_git[stamp_path] = {
                'type': 'file',
                'state': 'untracked'}
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                # do not set backend here, to avoid a dedicated commit
                backend=None,
                # None causes version to be taken from config
                version=None,
                description=description,
                git_opts=initopts,
                fake_dates=fake_dates
            )
            # set the annex backend in .gitattributes as a staged change
            tbrepo.set_default_backend(
                cfg.obtain('datalad.repo.backend'),
                persistent=True, commit=False)
            add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'added'}
            # make sure that v6 annex repos never commit content under .datalad
            attrs_cfg = (
                ('config', 'annex.largefiles', 'nothing'),
                ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
                ('metadata/objects/**', 'annex.largefiles',
                 '({})'.format(cfg.obtain(
                     'datalad.metadata.create-aggregate-annex-limit'))))
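            # only add attribute settings that are not already in effect,
            # to avoid needless modifications of .gitattributes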
            attrs = tbds.repo.get_gitattributes(
                [op.join('.datalad', i[0]) for i in attrs_cfg])
            set_attrs = []
            for p, k, v in attrs_cfg:
                if not attrs.get(
                        op.join('.datalad', p), {}).get(k, None) == v:
                    set_attrs.append((p, {k: v}))
            if set_attrs:
                tbds.repo.set_gitattributes(
                    set_attrs,
                    attrfile=op.join('.datalad', '.gitattributes'))

            # prevent git annex from ever annexing .git* stuff (gh-1597)
            attrs = tbds.repo.get_gitattributes('.git')
            if not attrs.get('.git', {}).get(
                    'annex.largefiles', None) == 'nothing':
                tbds.repo.set_gitattributes([
                    ('**/.git*', {'annex.largefiles': 'nothing'})])
                # must use the repo.pathobj as this will have resolved symlinks
                add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                    'type': 'file',
                    'state': 'untracked'}

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note that the Dataset property `id` will change when we unset the
        # respective config variable, hence store it beforehand:
        tbds_id = tbds.id
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds.config.unset(id_var, where='dataset')

        if _seed is None:
            # the standard way: a time/MAC-based UUID
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # generate a reproducible UUID from the preseeded RNG
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
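        # re-use a previously recorded dataset ID if there was one (e.g. on
        # re-create), otherwise register the freshly generated UUID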
        tbds.config.add(
            id_var,
            tbds_id if tbds_id is not None else uuid_id,
            where='dataset',
            reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in iteritems(tbds.config.overrides):
            tbds.config.add(k, v, where='local', reload=False)

        # all config manipulation is done -> full reload
        tbds.config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbds.repo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'}

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.repo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        # the next step only makes sense if we saved the created dataset;
        # otherwise there is no committed state to be registered
        # in the parent
        if isinstance(dataset, Dataset) and dataset.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.save(
                    path=tbds.path,
            ):
                yield r

        res.update({'status': 'ok'})
        yield res

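        # finally, run any requested configuration procedures on the new
        # dataset and relay their result records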
        for cfg_proc_ in cfg_proc or []:
            for r in tbds.run_procedure('cfg_' + cfg_proc_):
                yield r