Example #1
def test_add_files(path):
    ds = Dataset(path).create(force=True)

    test_list_1 = ['test_annex.txt']
    test_list_2 = ['test.txt']
    test_list_3 = ['test1.dat', 'test2.dat']
    test_list_4 = [
        op.join('dir', 'testindir'),
        op.join('dir', OBSCURE_FILENAME)
    ]

    for arg in [(test_list_1[0], False), (test_list_2[0], True),
                (test_list_3, False), (test_list_4, False)]:
        # special case 4: give the dir:
        if arg[0] == test_list_4:
            result = ds.save('dir', to_git=arg[1])
            status = ds.repo.annexstatus(['dir'])
        else:
            result = ds.save(arg[0], to_git=arg[1])
            for a in assure_list(arg[0]):
                assert_result_count(result, 1, path=text_type(ds.pathobj / a))
            status = ds.repo.get_content_annexinfo(
                ut.Path(p) for p in assure_list(arg[0]))
        for f, p in iteritems(status):
            if arg[1]:
                assert p.get('key', None) is None, f
            else:
                assert p.get('key', None) is not None, f
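
Every example on this page leans on assure_list() to treat a single item and a sequence of items uniformly, exactly as the test above does for its single-path and multi-path cases. A minimal standalone sketch of that normalization, stated as an assumption about the semantics rather than DataLad's actual implementation:

def assure_list_sketch(value):
    # hypothetical stand-in: None -> [], scalar -> [scalar], sequence -> list
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]

# both calling styles end up iterating over a plain list of paths
assert assure_list_sketch('test_annex.txt') == ['test_annex.txt']
assert assure_list_sketch(['test1.dat', 'test2.dat']) == ['test1.dat', 'test2.dat']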
Example #2
def test_add_files(path):
    ds = Dataset(path).create(force=True)

    test_list_1 = ['test_annex.txt']
    test_list_2 = ['test.txt']
    test_list_3 = ['test1.dat', 'test2.dat']
    test_list_4 = [op.join('dir', 'testindir'),
                   op.join('dir', OBSCURE_FILENAME)]

    for arg in [(test_list_1[0], False),
                (test_list_2[0], True),
                (test_list_3, False),
                (test_list_4, False)]:
        # special case 4: give the dir:
        if arg[0] == test_list_4:
            result = ds.save('dir', to_git=arg[1])
            status = ds.repo.annexstatus(['dir'])
        else:
            result = ds.save(arg[0], to_git=arg[1])
            for a in assure_list(arg[0]):
                assert_result_count(result, 1, path=text_type(ds.pathobj / a))
            status = ds.repo.get_content_annexinfo(
                ut.Path(p) for p in assure_list(arg[0]))
        for f, p in iteritems(status):
            if arg[1]:
                assert p.get('key', None) is None, f
            else:
                assert p.get('key', None) is not None, f
Example #3
def _get_procedure_implementation(name='*', ds=None):
    ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None
    # 1. check dataset for procedure
    if ds is not None and ds.is_installed():
        # could be more than one
        dirs = assure_list(ds.config.obtain('datalad.locations.dataset-procedures'))
        for dir in dirs:
            # TODO `get` dirs if necessary
            for m in _get_file_match(op.join(ds.path, dir), name):
                yield m
    # 2. check system and user account for procedure
    for loc in (cfg.obtain('datalad.locations.user-procedures'),
                cfg.obtain('datalad.locations.system-procedures')):
        for dir in assure_list(loc):
            for m in _get_file_match(dir, name):
                yield m
    # 3. check extensions for procedure
    # delay heavy import until here
    from pkg_resources import iter_entry_points
    from pkg_resources import resource_isdir
    from pkg_resources import resource_filename
    for entry_point in iter_entry_points('datalad.extensions'):
        # use of '/' here is OK wrt platform compatibility
        if resource_isdir(entry_point.module_name, 'resources/procedures'):
            for m in _get_file_match(
                    resource_filename(
                        entry_point.module_name,
                        'resources/procedures'),
                    name):
                yield m
    # 4. at last check datalad itself for procedure
    for m in _get_file_match(
            resource_filename('datalad', 'resources/procedures'),
            name):
        yield m
Example #4
def _get_procedure_implementation(name='*', ds=None):
    """get potential procedure path and configuration

    Order of consideration is user-level, system-level, dataset,
    datalad extensions, datalad. The first one found according to this order
    is the one returned. Therefore local definitions/configurations take
    precedence over ones that come from outside (via a datalad extension or a
    dataset with its .datalad/config). If a dataset had precedence (as it did
    before), the addition (or just an update) of a (sub-)dataset could
    otherwise surprisingly cause you to execute code different from what you
    defined within ~/.gitconfig or your local repository's .git/config.
    So, local definitions take precedence over remote ones, and more specific
    ones over more general ones.

    Returns
    -------
    tuple
      path, format string, help message
    """

    ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None

    # 1. check system and user account for procedure
    for loc in (cfg.obtain('datalad.locations.user-procedures'),
                cfg.obtain('datalad.locations.system-procedures')):
        for dir in assure_list(loc):
            for m, n in _get_file_match(dir, name):
                yield (m, ) + _get_proc_config(n)
    # 2. check dataset for procedure
    if ds is not None and ds.is_installed():
        # could be more than one
        dirs = assure_list(
            ds.config.obtain('datalad.locations.dataset-procedures'))
        for dir in dirs:
            # TODO `get` dirs if necessary
            for m, n in _get_file_match(op.join(ds.path, dir), name):
                yield (m, ) + _get_proc_config(n, ds=ds)
        # 2.1. check subdatasets recursively
        for subds in ds.subdatasets(return_type='generator',
                                    result_xfm='datasets'):
            for m, f, h in _get_procedure_implementation(name=name, ds=subds):
                yield m, f, h

    # 3. check extensions for procedure
    # delay heavy import until here
    from pkg_resources import iter_entry_points
    from pkg_resources import resource_isdir
    from pkg_resources import resource_filename
    for entry_point in iter_entry_points('datalad.extensions'):
        # use of '/' here is OK wrt platform compatibility
        if resource_isdir(entry_point.module_name, 'resources/procedures'):
            for m, n in _get_file_match(
                    resource_filename(entry_point.module_name,
                                      'resources/procedures'), name):
                yield (m, ) + _get_proc_config(n)
    # 4. at last check datalad itself for procedure
    for m, n in _get_file_match(
            resource_filename('datalad', 'resources/procedures'), name):
        yield (m, ) + _get_proc_config(n)
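
Since the generator yields (path, format, help) tuples in precedence order, a caller that only wants the winning procedure can take the first match. A hedged consumption sketch; the procedure name and the some_dataset variable are hypothetical:

# the first yielded match wins, because results arrive in precedence order
matches = _get_procedure_implementation(name='cfg_myprocedure', ds=some_dataset)
first = next(matches, None)
if first is not None:
    proc_path, call_format, help_msg = first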
Example #5
def save_dataset_hierarchy(
        info,
        base=None,
        message='[DATALAD] saved changes',
        version_tag=None):
    """Save (disjoint) hierarchies of datasets.

    Saving is done in an order that guarantees that every dataset to be
    saved reflects any possible change in any other to-be-saved subdataset
    before it is saved itself.

    Parameters
    ----------
    info : dict
      Absolute paths of datasets to be saved are the keys, and paths in each
      dataset to be saved are the values
    base : path or None, optional
      Common super dataset that should also be saved.
    message : str
      Message to be used for saving individual datasets

    Returns
    -------
    list
      Instances of saved datasets, in the order in which they were saved.
    """
    if not isinstance(info, dict):
        info = assure_list(info)
        info = dict(zip(info, [[i] for i in info]))
    dpaths = info.keys()
    if base:
        # just a convenience...
        dpaths = assure_list(dpaths)
        dpaths.append(base.path if isinstance(base, Dataset) else base)
    # sort all datasets under their potential superdatasets
    # start from the top to get all subdatasets down the line
    # and collate them into as few superdatasets as possible
    superdss = get_tree_roots(dpaths)
    # for each "superdataset" check the tree of subdatasets and make sure
    # we gather all datasets between the super and any subdataset
    # so we can save them all bottom-up in order to be able to properly
    # save the superdataset
    for superds_path in superdss:
        target_subs = superdss[superds_path]
        sort_paths_into_subdatasets(superds_path, target_subs, info)
    # iterate over all datasets, starting at the bottom
    saved = []
    for dpath in sorted(info.keys(), reverse=True):
        ds = Dataset(dpath)
        if ds.is_installed():
            saved_state = save_dataset(
                ds,
                info[dpath],
                message=message,
                version_tag=version_tag)
            if saved_state:
                saved.append(ds)
    return saved
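
The info mapping described in the docstring pairs each dataset path with the paths to be saved inside it. A hedged sketch of such a call; the directory layout is made up for illustration:

# hypothetical layout: a superdataset at /data/super containing one subdataset
info = {
    '/data/super': ['/data/super/notes.txt'],
    '/data/super/sub': ['/data/super/sub/raw.dat'],
}
saved = save_dataset_hierarchy(
    info,
    base='/data/super',
    message='[DATALAD] saved changes in super and sub')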
Example #6
    def __call__(path=None,
                 dataset=None,
                 fulfilled=None,
                 recursive=False,
                 recursion_limit=None,
                 contains=None,
                 bottomup=False,
                 set_property=None,
                 delete_property=None):
        # no constraints given -> query subdatasets under curdir
        if not path and dataset is None:
            path = os.curdir
        paths = [rev_resolve_path(p, dataset) for p in assure_list(path)] \
            if path else None

        ds = require_dataset(dataset,
                             check_installed=False,
                             purpose='subdataset reporting/modification')
        lgr.debug('Query subdatasets of %s', dataset)
        if paths is not None:
            lgr.debug('Query subdatasets underneath paths: %s', paths)
        refds_path = ds.path

        # XXX this seems strange, but is tested to be the case -- I'd rather set
        # `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must "
                        "start with a letter)" % k)
        if contains:
            contains = [
                rev_resolve_path(c, dataset) for c in assure_list(contains)
            ]
        for r in _get_submodules(ds, paths, fulfilled, recursive,
                                 recursion_limit, contains, bottomup,
                                 set_property, delete_property, refds_path):
            # a boat-load of ancient code consumes this and is ignorant of
            # Path objects
            r['path'] = text_type(r['path'])
            # without the refds_path, results cannot be rendered/converted
            # to relative paths in the eval_results decorator
            r['refds'] = refds_path
            yield r
Example #7
    def __call__(
            path=None,
            dataset=None,
            fulfilled=None,
            recursive=False,
            recursion_limit=None,
            contains=None,
            bottomup=False,
            set_property=None,
            delete_property=None):
        # no constraints given -> query subdatasets under curdir
        if not path and dataset is None:
            path = os.curdir
        paths = [rev_resolve_path(p, dataset) for p in assure_list(path)] \
            if path else None

        ds = require_dataset(
            dataset, check_installed=False, purpose='subdataset reporting/modification')
        lgr.debug('Query subdatasets of %s', dataset)
        if paths is not None:
            lgr.debug('Query subdatasets underneath paths: %s', paths)
        refds_path = ds.path

        # XXX this seems strange, but is tested to be the case -- I'd rather set
        # `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must "
                        "start with a letter)" % k)
        if contains:
            contains = [rev_resolve_path(c, dataset) for c in assure_list(contains)]
        for r in _get_submodules(
                ds, paths, fulfilled, recursive, recursion_limit,
                contains, bottomup, set_property, delete_property,
                refds_path):
            # a boat-load of ancient code consumes this and is ignorant of
            # Path objects
            r['path'] = text_type(r['path'])
            # without the refds_path, results cannot be rendered/converted
            # to relative paths in the eval_results decorator
            r['refds'] = refds_path
            yield r
Example #8
    def get_query(self, query):
        query = assure_list(query)
        simple_fieldspec = re.compile(r"(?P<field>\S*?):(?P<query>.*)")
        quoted_fieldspec = re.compile(r"'(?P<field>[^']+?)':(?P<query>.*)")
        query_rec_matches = [
            simple_fieldspec.match(q) or
            quoted_fieldspec.match(q) or
            q
            for q in query]
        query_group_dicts_only = [
            q.groupdict() for q in query_rec_matches if hasattr(q, 'groupdict')
        ]
        self._queried_keys = [
            qgd['field']
            for qgd in query_group_dicts_only
            if ('field' in qgd and qgd['field'])
        ]
        if len(query_group_dicts_only) != len(query_rec_matches):
            # we had a query element without a field specification;
            # add None as an indicator of that
            self._queried_keys.append(None)
        # expand matches, compile expressions
        query = [
            {k: re.compile(self._xfm_query(v)) for k, v in q.groupdict().items()}
            if hasattr(q, 'groupdict') else re.compile(self._xfm_query(q))
            for q in query_rec_matches
        ]

        # turn "empty" field specs into simple queries
        # this is used to forcibly disable field-based search
        # e.g. when searching for a value
        query = [q['query']
                 if isinstance(q, dict) and q['field'].pattern == '' else q
                 for q in query]
        return query
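
The two regular expressions above split a query term into a field part and a query part, with the quoted form allowing fields that contain spaces. A small standalone check of just that parsing step; the query terms are made up:

import re

simple_fieldspec = re.compile(r"(?P<field>\S*?):(?P<query>.*)")
quoted_fieldspec = re.compile(r"'(?P<field>[^']+?)':(?P<query>.*)")

for term in ["author:haxby", "'my field':brain", "plainword"]:
    m = simple_fieldspec.match(term) or quoted_fieldspec.match(term)
    # terms without a field specification fall through unmatched
    print(term, '->', m.groupdict() if m else 'no field spec')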
Example #9
def test_recurse_existing(src, path):
    origin_ds = _make_dataset_hierarchy(src)

    # make sure recursion_limit works as expected across a range of depths
    for depth in range(len(origin_ds)):
        datasets = assure_list(
            install(path, source=src, recursive=True, recursion_limit=depth))
        # we expect one dataset per level
        eq_(len(datasets), depth + 1)
        rmtree(path)

    # now install all but the last two levels, no data
    root, sub1, sub2 = install(path, source=src, recursive=True, recursion_limit=2)
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is False)
    sub3 = Dataset(opj(sub2.path, 'sub3'))
    ok_(not sub3.is_installed())
    # now get all content in all existing datasets, no new datasets installed
    # in the process
    files = root.get(curdir, recursive=True, recursion_limit='existing')
    eq_(len(files), 1)
    ok_(sub2.repo.file_has_content('file_in_annex.txt') is True)
    ok_(not sub3.is_installed())
    # now pull down all remaining datasets, no data
    sub3, sub4 = root.get(curdir, recursive=True, get_data=False)
    ok_(sub4.is_installed())
    ok_(sub3.repo.file_has_content('file_in_annex.txt') is False)
    # aaannd all data
    files = root.get(curdir, recursive=True)
    eq_(len(files), 1)
    ok_(sub3.repo.file_has_content('file_in_annex.txt') is True)
Example #10
def get_normalized_path_arguments(paths, dataset=None, default=None):
    """Apply standard resolution to path arguments

    This is nothing more than a helper to standardize path argument
    preprocessing.

    Parameters
    ----------
    paths : sequence or single path
      Path(s) to normalize
    dataset : path or Dataset or None
      Optional dataset identifying something against which to resolve input
      path arguments
    default: sequence of paths or single path or None
      If `paths` is empty, use this instead

    Returns
    -------
    tuple(list(paths), path)
      Normalized paths and path to a potential dataset against which paths were
      resolved.
    """
    dataset_path = dataset.path if isinstance(dataset, Dataset) else dataset
    if not paths and default:
        paths = default
    paths = assure_list(paths)
    # resolve path(s):
    resolved_paths = [resolve_path(p, dataset) for p in paths]
    if dataset:
        # guarantee absolute paths
        resolved_paths = [opj(dataset_path, p) for p in resolved_paths]
    lgr.debug('Resolved input path arguments: %s', resolved_paths)
    return resolved_paths, dataset_path
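
A hedged usage sketch of the helper above; the dataset location and input paths are made up. Relative inputs come back resolved against the dataset, and the dataset path is returned alongside them:

# hypothetical call: two relative paths resolved against a dataset at /data/ds
paths, ds_path = get_normalized_path_arguments(
    ['raw/scan1.dat', 'raw/scan2.dat'],
    dataset='/data/ds')
# paths -> absolute paths under /data/ds, ds_path -> '/data/ds'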
Example #12
    def close(self, allow_fail=True, ctrl_path=None):
        """Closes all connections, known to this instance.

        Parameters
        ----------
        allow_fail: bool, optional
          If True, swallow exceptions which might be thrown during
          connection.close, and just log them at DEBUG level
        ctrl_path: str or list of str, optional
          If specified, only the path(s) provided will be considered
        """
        if self._connections:
            from datalad.utils import assure_list
            ctrl_paths = assure_list(ctrl_path)
            to_close = [
                c for c in self._connections
                # don't close if connection wasn't opened by SSHManager
                if self._connections[c].ctrl_path not in self._prev_connections
                and self._connections[c].ctrl_path.exists() and
                (not ctrl_paths or self._connections[c].ctrl_path in ctrl_paths
                 )
            ]
            if to_close:
                lgr.debug("Closing %d SSH connections..." % len(to_close))
            for cnct in to_close:
                f = self._connections[cnct].close
                if allow_fail:
                    f()
                else:
                    try:
                        f()
                    except Exception as exc:
                        lgr.debug("Failed to close a connection: "
                                  "%s", exc_str(exc))
            self._connections = dict()
Example #13
def results_from_paths(paths,
                       action=None,
                       type=None,
                       logger=None,
                       refds=None,
                       status=None,
                       message=None):
    """
    Helper to yield analogous result dicts for each path in a sequence.

    Parameters
    ----------
    message: str
      A result message. May contain `%s` which will be replaced by the
      respective `path`.

    Returns
    -------
    generator

    """
    for p in assure_list(paths):
        yield get_status_dict(action,
                              path=p,
                              type=type,
                              logger=logger,
                              refds=refds,
                              status=status,
                              message=(message,
                                       p) if '%s' in message else message)
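
When the message contains %s, the helper pairs the template with each path (as a (message, path) tuple) so that downstream result rendering can interpolate it per path. A hedged usage sketch with illustrative arguments:

# each yielded result dict carries the message template together with its path
for res in results_from_paths(
        ['a.txt', 'b.txt'],
        action='drop',
        status='notneeded',
        message='nothing to do for %s'):
    print(res['path'], res['message'])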
Example #14
    def close(self, allow_fail=True, ctrl_path=None):
        """Closes all connections, known to this instance.

        Parameters
        ----------
        allow_fail: bool, optional
          If True, swallow exceptions which might be thrown during
          connection.close, and just log them at DEBUG level
        ctrl_path: str or list of str, optional
          If specified, only the path(s) provided will be considered
        """
        if self._connections:
            from datalad.utils import assure_list
            ctrl_paths = assure_list(ctrl_path)
            to_close = [c for c in self._connections
                        # don't close if connection wasn't opened by SSHManager
                        if self._connections[c].ctrl_path
                        not in self._prev_connections and
                        exists(self._connections[c].ctrl_path)
                        and (not ctrl_paths
                             or self._connections[c].ctrl_path in ctrl_paths)]
            if to_close:
                lgr.debug("Closing %d SSH connections..." % len(to_close))
            for cnct in to_close:
                f = self._connections[cnct].close
                if allow_fail:
                    f()
                else:
                    try:
                        f()
                    except Exception as exc:
                        lgr.debug("Failed to close a connection: "
                                  "%s", exc_str(exc))
            self._connections = dict()
Example #15
def is_result_matching_pathsource_argument(res, **kwargs):
    # we either have any non-zero number of "paths" (that could be anything), or
    # we have one path and one source
    # we don't do any error checking here, done by the command itself
    source = kwargs.get('source', None)
    if source is not None:
        # if there was a source, it needs to be recorded in the result
        # otherwise this is not what we are looking for
        return source == res.get('source_url', None)
    # the only thing left is a potentially heterogeneous list of paths/URLs
    paths = assure_list(kwargs.get('path', []))
    # three cases left:
    # 1. input arg was an absolute path -> must match 'path' property
    # 2. input arg was relative to a dataset -> must match refds/relpath
    # 3. something nifty with a relative input path that uses PWD as the
    #    reference
    respath = res.get('path', None)
    if respath in paths:
        # absolute match, pretty sure we want this
        return True
    elif kwargs.get('dataset', None) and YieldRelativePaths()(res) in paths:
        # command was called with a reference dataset, and a relative
        # path of a result matches in input argument -- not 100% exhaustive
        # test, but could be good enough
        return True
    elif any(abspath(p) == respath for p in paths):
        # one absolutified input path matches the result path
        # I'd say: go for it!
        return True
    elif any(p == res.get('source_url', None) for p in paths):
        # this was installed from a URL that was given, we'll take that too
        return True
    else:
        return False
Example #16
 def get_query(self, query):
     # parse the query string
     self._mk_parser()
     # for convenience we accept any number of args-words from the
     # shell and put them together to a single string here
     querystr = ' '.join(assure_list(query))
     # this gives a formal whoosh query
     wquery = self.parser.parse(querystr)
     return wquery
Example #17
 def __call__(module=None, verbose=False, nocapture=False, pdb=False, stop=False):
     if not module:
         from pkg_resources import iter_entry_points
         module = ['datalad']
         module.extend(ep.module_name for ep in iter_entry_points('datalad.tests'))
     module = assure_list(module)
     lgr.info('Starting test run for module(s): %s', module)
     for mod in module:
         datalad.test(module=mod, verbose=verbose, nocapture=nocapture, pdb=pdb, stop=stop)
Example #18
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability
      checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts
    """
    if 'action' not in kwargs:
        kwargs['action'] = 'drop'
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = assure_list(paths)
    if not hasattr(ds.repo, 'drop'):
        for p in paths:
            r = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            r['action'] = 'drop'
            yield r
        return

    opts = ['--force'] if not check else []
    respath_by_status = {}
    for res in ds.repo.drop(paths, options=opts):
        res = annexjson2result(
            # annex reports are always about files
            res,
            ds,
            type='file',
            **kwargs)
        success = success_status_map[res['status']]
        respath_by_status[success] = \
            respath_by_status.get(success, []) + [res['path']]
        yield res
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds,
            paths,
            respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        r['action'] = 'drop'
        yield r
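
A hedged sketch of consuming the generator above, assuming ds is an installed annex-backed Dataset; the file names are illustrative:

# drop two annexed files without availability checks and collect failures
failed = [
    res['path']
    for res in _drop_files(ds, ['big1.nii.gz', 'big2.nii.gz'], check=False)
    if res['status'] not in ('ok', 'notneeded')
]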
Example #19
 def custom_result_renderer(res, **kwargs):
     if res['status'] != 'ok' or \
             not res.get('action', None) == 'meta_extract':
         # logging complained about this already
         return
     if 'state' in res and 'extractor' in res:
         # extractor report, special treatment
         ui.message('{name}({state})'.format(
             name=ac.color_word(res['extractor'], ac.BOLD),
             state=','.join(
                 '{}{}{}{}'.format(
                     # boolean states get a + or - prefix
                     '+' if v is True else '-' if v is False else '',
                     k,
                     '=' if not isinstance(v, bool) else '',
                     v if not isinstance(v, bool) else '')
                 for k, v in iteritems(res['state'])
                 # this is an extractor property, and mostly serves
                 # internal purposes
                 if k not in ('unique_exclude', )),
         ))
         return
     if kwargs.get('format', None) == 'jsonld':
         # special case of a JSON-LD report request
         # all reports are consolidated into a single
         # graph, dumps just that (no pretty printing, can
         # be done outside)
         ui.message(
             jsondumps(
                 res['metadata'],
                 # support utf-8 output
                 ensure_ascii=False,
                 # this cannot happen, spare the checks
                 check_circular=False,
                 # this will cause the output to not necessarily be
                 # JSON compliant, but at least contain all info that went
                 # in, and be usable for javascript consumers
                 allow_nan=True,
             ))
         return
     # list the path, available metadata keys, and tags
     path = op.relpath(res['path'], res['refds']) if res.get(
         'refds', None) else res['path']
     meta = res.get('metadata', {})
     ui.message('{path}{type}:{spacer}{meta}{tags}'.format(
         path=ac.color_word(path, ac.BOLD),
         type=' ({})'.format(ac.color_word(res['type'], ac.MAGENTA))
         if 'type' in res else '',
         spacer=' ' if len([m for m in meta if m != 'tag']) else '',
         meta=','.join(k for k in sorted(meta.keys())
                       if k not in ('tag', '@context', '@id'))
         if meta else ' -' if 'metadata' in res else ' {}'.format(','.join(
             e for e in res['extractors']
             if e not in ('datalad_core', 'metalad_core',
                          'metalad_annex'))) if 'extractors' in res else '',
         tags='' if 'tag' not in meta else ' [{}]'.format(','.join(
             assure_list(meta['tag'])))))
Example #20
def _get_dsmeta_srcfiles(ds):
    # which files to look at
    cfg_srcfiles = ds.config.obtain('datalad.metadata.custom-dataset-source',
                                    [])
    cfg_srcfiles = assure_list(cfg_srcfiles)
    # OK to be always POSIX
    srcfiles = ['.metadata/dataset.json'] \
        if not cfg_srcfiles and op.lexists(
            text_type(ds.pathobj / '.metadata' / 'dataset.json')) \
        else cfg_srcfiles
    return srcfiles, cfg_srcfiles
Example #21
 def result_renderer_cmdline(res, args):
     from datalad.ui import ui
     res = assure_list(res)
     if args.dryrun:
         ui.message('DRYRUN -- Anticipated results:')
     if not len(res):
         ui.message("Nothing done")
     else:
         for d, url, existed in res:
             ui.message("'{}'{} configured as sibling '{}' for {}".format(
                 url, " (existing repository)" if existed else '',
                 args.name, d))
Example #22
def pipeline(url, project, project_access='public', subjects=None):
    # TODO: Ben: Clarify parameters. In particular `project_access` is unclear to me
    subjects = assure_list(subjects)

    xnat = XNATServer(url)

    def get_project_info(data):
        out = xnat('data/projects/%s' % project,
                   return_plain=True
                   )
        # for NITRC I need to get more!
        # "http://nitrc_es.projects.nitrc.org/datalad/%s" % dataset
        items = out['items']
        assert len(items) == 1
        dataset_meta = items[0]['data_fields']
        # TODO: save into a file
        yield data

    def get_files(data):

        for f in xnat.get_all_files_for_project(project, subjects=subjects):
            # TODO: tune up filename
            # TODO: get url
            prefix = '/data/experiments/'
            assert f['uri'].startswith('%s' % prefix)
            # TODO:  use label for subject/experiment
            # TODO: might want to allow for
            #   XNAT2BIDS whenever that one is available:
            #     http://reproducibility.stanford.edu/accepted-projects-for-the-2nd-crn-coding-sprint/
            exp_label = xnat.experiment_labels[f['experiment_id']]
            yield updated(data,
                          {'url': url + f['uri'],
                           'path': f['uri'][len(prefix):], 
                           'name': '%s-%s' % (exp_label, f['name'])
                           })

    annex = Annexificator(
        create=False,  # must be already initialized etc
        # leave in Git only obvious descriptors and code snippets -- the rest goes to annex
        # so maybe eventually we could take advantage of git tags for changing the layout
        statusdb='json',
        special_remotes=['datalad'] if project_access != 'public' else None
    )

    return [
        get_project_info,
        [
            get_files,
            annex
        ],
        annex.finalize(cleanup=True, aggregate=True),
    ]
Example #23
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability
      checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts
    """
    if 'action' not in kwargs:
        kwargs['action'] = 'drop'
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = assure_list(paths)
    if not hasattr(ds.repo, 'drop'):
        for p in paths:
            r = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            r['action'] = 'drop'
            yield r
        return

    opts = ['--force'] if not check else []
    respath_by_status = {}
    for res in ds.repo.drop(paths, options=opts):
        res = annexjson2result(
            # annex reports are always about files
            res, ds, type='file', **kwargs)
        success = success_status_map[res['status']]
        respath_by_status[success] = \
            respath_by_status.get(success, []) + [res['path']]
        yield res
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds, paths, respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        r['action'] = 'drop'
        yield r
Example #24
def _check_deps(repo, deps):
    """Check if all `deps` remotes are known to the `repo`

    Raises
    ------
    ValueError
      if any of the deps is an unknown remote
    """
    unknown_deps = set(assure_list(deps)).difference(repo.get_remotes())
    if unknown_deps:
        raise ValueError(
            'unknown sibling(s) specified as publication dependency: %s' %
            unknown_deps)
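
A hedged sketch of how the check behaves: a dependency name that is not among the repository's remotes raises the ValueError; the remote names are made up:

# assuming `repo` knows a remote 'origin' but none named 'backup'
try:
    _check_deps(repo, ['origin', 'backup'])
except ValueError as e:
    print('refusing to proceed:', e)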
Example #25
def superdataset_pipeline(url, limit=None, drop_empty=True):
    """
    
    Parameters
    ----------
    url
    limit : TODO, optional
      Types of access to limit to, see XNAT.get_datasets
    drop_empty: bool, optional
      If set, do not create datasets which are empty (no files).
      Note - it requires obtaining details for every project, which could be
      a heavy operation
    kwargs

    Returns
    -------

    """

    annex = Annexificator(no_annex=True, allow_dirty=False)
    lgr.info("Creating a pipeline with url=%s limit=%s drop_empty=%s", url, limit, drop_empty)
    limit = assure_list(limit)
    drop_empty = assure_bool(drop_empty)

    def get_projects(data):
        xnat = XNATServer(url)
        for p in xnat.get_projects(
                asdict=False,
                limit=limit or PROJECT_ACCESS_TYPES,
                drop_empty=drop_empty
        ):
            yield updated(data, p)

    return [
        get_projects,
        assign({'project': '%(id)s',
                'dataset_name': '%(id)s',
                'url': url
                }, interpolate=True),
        # TODO: should we respect  x quarantine_status
        annex.initiate_dataset(
            template="xnat",
            data_fields=['project', 'url', 'project_access'],  # TODO: may be project_access
            # let all specs and modifications reside in master
            # branch='incoming',  # there will be archives etc
            existing='skip'
            # further any additional options
        )
    ]
Example #26
 def result_renderer_cmdline(res, args):
     from datalad.ui import ui
     res = assure_list(res)
     if args.dryrun:
         ui.message('DRYRUN -- Anticipated results:')
     if not len(res):
         ui.message("Nothing done")
     else:
         for d, url, existed in res:
             ui.message(
                 "'{}'{} configured as sibling '{}' for {}".format(
                     url,
                     " (existing repository)" if existed else '',
                     args.name,
                     d))
Example #27
    def __init__(self, dataset=None):
        """Retrieves the configured set of rules

        Rules are defined by classes ... + __datalad_hirni_rules
        datalad.hirni.dicom2spec.rules  ... multiple

        Parameters
        ----------
        dataset: Dataset
          Dataset to read possibly customized rules from
        """

        from datalad.utils import assure_list
        from datalad import cfg as dl_cfg
        from datalad_hirni.support.default_rules import DefaultRules
        cfg = dataset.config if dataset else dl_cfg

        self._rule_set = []
        # get a list of paths to build the rule set from
        # Note: assure_list is supposed to return empty list if there's nothing
        self._file_list = \
            assure_list(cfg.get("datalad.hirni.dicom2spec.rules"))
        lgr.debug("loaded list of rule files: %s", self._file_list)

        for file in self._file_list:
            if not op.exists(file) or not op.isfile(file):
                lgr.warning(
                    "Ignored invalid path for dicom2spec rules "
                    "definition: %s", file)
                continue

            from datalad.utils import import_module_from_file
            from datalad.dochelpers import exc_str
            try:
                mod = import_module_from_file(file)
            except Exception as e:
                # any exception means full stop
                raise ValueError("Rules definition file at {} is broken: {}"
                                 "".format(file, exc_str(e)))

            # check file's __datalad_hirni_rules for the actual class:
            if not hasattr(mod, "__datalad_hirni_rules"):
                raise ValueError("Rules definition file {} missed attribute "
                                 "'__datalad_hirni_rules'.".format(file))
            self._rule_set.append(getattr(mod, "__datalad_hirni_rules"))

        if not self._rule_set:
            self._rule_set = [DefaultRules]
Example #28
def check_integration1(login,
                       keyring,
                       path,
                       organization=None,
                       kwargs={},
                       oauthtokens=None):
    kwargs = kwargs.copy()
    if organization:
        kwargs['github_organization'] = organization

    ds = Dataset(path).create()
    if oauthtokens:
        for oauthtoken in assure_list(oauthtokens):
            ds.config.add('hub.oauthtoken', oauthtoken, where='local')

    # so we do not pick up local repo configuration/token
    repo_name = 'test_integration1'
    with chpwd(path):
        # ATM all the github goodness does not care about "this dataset"
        # so force "process wide" cfg to pick up our defined above oauthtoken
        cfg.reload(force=True)
        # everything works just nice, no conflicts etc
        res = ds.create_sibling_github(repo_name, **kwargs)

        if organization:
            url_fmt = 'https://{login}@github.com/{organization}/{repo_name}.git'
        else:
            url_fmt = 'https://github.com/{login}/{repo_name}.git'
        eq_(res, [(ds, url_fmt.format(**locals()), False)])

        # but if we rerun - should kaboom since already has this sibling:
        with assert_raises(ValueError) as cme:
            ds.create_sibling_github(repo_name, **kwargs)
        assert_in("already has a configured sibling", str(cme.exception))

        # but we can give it a new name, but it should kaboom since the remote one
        # exists already
        with assert_raises(ValueError) as cme:
            ds.create_sibling_github(repo_name, name="github2", **kwargs)
        assert_in("already exists on", str(cme.exception))
        # we should not leave the broken sibling behind
        assert_not_in('github2', ds.repo.get_remotes())

        # If we ask to reconfigure - should proceed normally
        ds.create_sibling_github(repo_name, existing='reconfigure', **kwargs)
    cfg.reload(force=True)
Example #29
def check_integration1(login, keyring,
                       path,
                       organization=None,
                       kwargs={},
                       oauthtokens=None):
    kwargs = kwargs.copy()
    if organization:
        kwargs['github_organization'] = organization

    ds = Dataset(path).create()
    if oauthtokens:
        for oauthtoken in assure_list(oauthtokens):
            ds.config.add('hub.oauthtoken', oauthtoken, where='local')

    # so we do not pick up local repo configuration/token
    repo_name = 'test_integration1'
    with chpwd(path):
        # ATM all the github goodness does not care about "this dataset"
        # so force "process wide" cfg to pick up our defined above oauthtoken
        cfg.reload(force=True)
        # everything works just nice, no conflicts etc
        res = ds.create_sibling_github(repo_name, **kwargs)

        if organization:
            url_fmt = 'https://{login}@github.com/{organization}/{repo_name}.git'
        else:
            url_fmt = 'https://github.com/{login}/{repo_name}.git'
        eq_(res, [(ds, url_fmt.format(**locals()), False)])

        # but if we rerun - should kaboom since already has this sibling:
        with assert_raises(ValueError) as cme:
            ds.create_sibling_github(repo_name, **kwargs)
        assert_in("already has a configured sibling", str(cme.exception))

        # but we can give it a new name, but it should kaboom since the remote one
        # exists already
        with assert_raises(ValueError) as cme:
            ds.create_sibling_github(repo_name, name="github2", **kwargs)
        assert_in("already exists on", str(cme.exception))
        # we should not leave the broken sibling behind
        assert_not_in('github2', ds.repo.get_remotes())

        # If we ask to reconfigure - should proceed normally
        ds.create_sibling_github(repo_name, existing='reconfigure', **kwargs)
    cfg.reload(force=True)
Example #30
 def custom_result_renderer(res, **kwargs):
     if res['status'] != 'ok' or not res.get('action', None) == 'metadata':
         # logging complained about this already
         return
     # list the path, available metadata keys, and tags
     path = relpath(res['path'], res['refds']) if res.get(
         'refds', None) else res['path']
     meta = res.get('metadata', {})
     ui.message('{path}{type}:{spacer}{meta}{tags}'.format(
         path=ac.color_word(path, ac.BOLD),
         type=' ({})'.format(ac.color_word(res['type'], ac.MAGENTA))
         if 'type' in res else '',
         spacer=' ' if len([m for m in meta if m != 'tag']) else '',
         meta=','.join(k for k in sorted(meta.keys())
                       if k not in ('tag', '@context', '@id'))
         if meta else ' -' if 'metadata' in res else ' aggregated',
         tags='' if 'tag' not in meta else ' [{}]'.format(','.join(
             assure_list(meta['tag'])))))
Example #31
 def custom_result_renderer(res, **kwargs):
     if res['status'] != 'ok' or not res.get('action', None) == 'metadata':
         # logging complained about this already
         return
     # list the path, available metadata keys, and tags
     path = op.relpath(res['path'],
                    res['refds']) if res.get('refds', None) else res['path']
     meta = res.get('metadata', {})
     ui.message('{path}{type}:{spacer}{meta}{tags}'.format(
         path=ac.color_word(path, ac.BOLD),
         type=' ({})'.format(
             ac.color_word(res['type'], ac.MAGENTA)) if 'type' in res else '',
         spacer=' ' if len([m for m in meta if m != 'tag']) else '',
         meta=','.join(k for k in sorted(meta.keys())
                       if k not in ('tag', '@context', '@id'))
              if meta else ' -' if 'metadata' in res else ' aggregated',
         tags='' if 'tag' not in meta else ' [{}]'.format(
              ','.join(assure_list(meta['tag'])))))
Example #32
def sort_paths_into_subdatasets(superds_path, target_subs, spec):
    # XXX forge a chain: whenever some path needs to be pushed down
    # put the receiving dataset as a component to process into the
    # respective superdataset -- this will enable further processing
    # of all datasets in a completely independent fashion
    # (except for order of processing)

    # get all existing subdataset as candidate nodes of the graph
    # that needs to be built and checked
    subds_graph = Dataset(superds_path).get_subdatasets(
        absolute=True, recursive=True, edges=True, fulfilled=True)
    if not subds_graph:
        # no subdatasets, nothing to sort
        return
    for t in target_subs:
        trace = get_trace(
            subds_graph,
            superds_path,
            t)
        if not trace:
            # not connected, or identical
            continue
        tosort = [superds_path] + trace + [t]
        # loop over all but the last one, simplifies logic below
        for i, d in enumerate(tosort[:-1]):
            paths = spec.get(d, [])
            keep_paths = []
            next_ds = tosort[i + 1]
            next_dspaths = spec.get(next_ds, [])
            comp = _with_sep(next_ds)
            for p in assure_list(paths):
                if p.startswith(comp):
                    next_dspaths.append(p)
                    # remember that we pushed the path into this dataset
                    keep_paths.append(next_ds)
                else:
                    keep_paths.append(p)
            spec[next_ds] = next_dspaths
            spec[d] = keep_paths
    # tidy up -- deduplicate
    for c in spec:
        spec[c] = list(set(spec[c]))
Example #33
def results_from_paths(paths, action=None, type=None, logger=None, refds=None,
                       status=None, message=None):
    """
    Helper to yield analogous result dicts for each path in a sequence.

    Parameters
    ----------
    message: str
      A result message. May contain `%s` which will be replaced by the
      respective `path`.

    Returns
    -------
    generator

    """
    for p in assure_list(paths):
        yield get_status_dict(
            action, path=p, type=type, logger=logger, refds=refds,
            status=status, message=(message, p) if '%s' in message else message)
Example #34
    def put(self, source, destination, recursive=False, preserve_attrs=False):
        """Copies source file/folder to destination on the remote.

        Note: this method performs escaping of filenames to an extent that
        moderately weird ones should work (spaces, quotes, pipes, other
        characters with special shell meaning), but more complicated cases
        might require appropriate external preprocessing of filenames.

        Parameters
        ----------
        source : str or list
          file/folder path(s) to copy from on local
        destination : str
          file/folder path to copy to on remote
        recursive : bool
          flag to enable recursive copying of given sources
        preserve_attrs : bool
          preserve modification times, access times, and modes from the
          original file

        Returns
        -------
        str
          stdout, stderr of the copy operation.
        """
        # make sure we have an open connection, will test if action is needed
        # by itself
        self.open()
        scp_cmd = self._get_scp_command_spec(recursive, preserve_attrs)
        # add source filepath(s) to scp command
        scp_cmd += assure_list(source)
        # add destination path
        scp_cmd += [
            '%s:%s' % (
                self.sshri.hostname,
                _quote_filename_for_scp(destination),
            )
        ]
        return self.runner.run(scp_cmd)
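
A hedged usage sketch of the method above, assuming conn is an already constructed connection object of this class pointing at a reachable host; local and remote paths are made up:

# copy one file and one directory tree to the remote side
stdout, stderr = conn.put(
    ['results.csv', 'figures'],
    '/remote/project/incoming',
    recursive=True,       # required for the directory
    preserve_attrs=True)  # keep modification times and modes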
Example #35
def is_result_matching_pathsource_argument(res, **kwargs):
    # we either have any non-zero number of "paths" (that could be anything), or
    # we have one path and one source
    # we don't do any error checking here, done by the command itself
    source = kwargs.get('source', None)
    if source is not None:
        # we want to be able to deal with Dataset instances given as 'source':
        if isinstance(source, Dataset):
            source = source.path
        # if there was a source, it needs to be recorded in the result
        # otherwise this is not what we are looking for
        return source == res.get('source_url', None)
    # the only thing left is a potentially heterogeneous list of paths/URLs
    paths = assure_list(kwargs.get('path', []))
    # three cases left:
    # 1. input arg was an absolute path -> must match 'path' property
    # 2. input arg was relative to a dataset -> must match refds/relpath
    # 3. something nifty with a relative input path that uses PWD as the
    #    reference
    respath = res.get('path', None)
    if respath in paths:
        # absolute match, pretty sure we want this
        return True
    elif kwargs.get('dataset', None) and YieldRelativePaths()(res) in paths:
        # command was called with a reference dataset, and a relative
        # path of a result matches in input argument -- not 100% exhaustive
        # test, but could be good enough
        return True
    elif any(robust_abspath(p) == respath for p in paths):
        # one absolutified input path matches the result path
        # I'd say: go for it!
        return True
    elif any(p == res.get('source_url', None) for p in paths):
        # this was installed from a URL that was given, we'll take that too
        return True
    else:
        return False
Example #36
    def add(self, var, value, where='dataset', reload=True):
        """Add a configuration variable and value

        Parameters
        ----------
        var : str
          Variable name, including any section, in the form `git config` expects, e.g.
          'core.editor'
        value : str
          Variable value
        %s"""
        if where == 'override':
            from datalad.utils import assure_list
            val = assure_list(self.overrides.pop(var, None))
            val.append(value)
            self.overrides[var] = val[0] if len(val) == 1 else val
            if reload:
                self.reload(force=True)
            return

        self._run(['--add', var, value],
                  where=where,
                  reload=reload,
                  log_stderr=True)
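
For where='override' the code above accumulates repeated values for the same variable into a list inside self.overrides (assuming assure_list() maps None to an empty list, as noted in Example #27). A hedged sketch of the effect, with cfg standing in for an instance of this class and a made-up variable value:

# first add stores a single override value; a second add for the same
# variable turns the override into a list holding both values
cfg.add('datalad.locations.dataset-procedures', 'code/procedures', where='override')
cfg.add('datalad.locations.dataset-procedures', 'tools/procedures', where='override')
# cfg.overrides['datalad.locations.dataset-procedures']
# -> ['code/procedures', 'tools/procedures']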
Example #37
    def get_query(self, query):
        query = assure_list(query)
        simple_fieldspec = re.compile(r"(?P<field>\S*?):(?P<query>.*)")
        quoted_fieldspec = re.compile(r"'(?P<field>[^']+?)':(?P<query>.*)")
        query = [
            simple_fieldspec.match(q) or quoted_fieldspec.match(q) or q
            for q in query
        ]
        # expand matches, compile expressions
        query = [{
            k: re.compile(self._xfm_query(v))
            for k, v in q.groupdict().items()
        } if hasattr(q, 'groupdict') else re.compile(self._xfm_query(q))
                 for q in query]

        # turn "empty" field specs into simple queries
        # this is used to forcibly disable field-based search
        # e.g. when searching for a value
        query = [
            q['query']
            if isinstance(q, dict) and q['field'].pattern == '' else q
            for q in query
        ]
        return query
Example #38
    def __call__(dataset, filename='README.md', existing='skip'):
        from os.path import lexists
        from os.path import join as opj
        from io import open
        import logging
        lgr = logging.getLogger('datalad.plugin.add_readme')

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import assure_list

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose='add README')

        filename = opj(dataset.path, filename)
        res_kwargs = dict(action='add_readme', path=filename)

        if lexists(filename) and existing == 'skip':
            yield dict(
                res_kwargs,
                status='notneeded',
                message='file already exists, and not appending content')
            return

        # unlock, file could be annexed
        if lexists(filename):
            dataset.unlock(filename)

        # get any metadata on the dataset itself
        dsinfo = dataset.metadata(
            '.', reporton='datasets', return_type='item-or-list',
            on_failure='ignore')
        meta = {}
        if not isinstance(dsinfo, dict) or dsinfo.get('status', None) != 'ok':
            lgr.warn("Could not obtain dataset metadata, proceeding without")
            dsinfo = {}
        else:
            # flatten possibly existing multiple metadata sources
            for src in dsinfo['metadata']:
                if src.startswith('@'):
                    # not a source
                    continue
                meta.update(dsinfo['metadata'][src])

        metainfo = ''
        for label, content in (
                ('', meta.get('description', meta.get('shortdescription', ''))),
                ('Author{}'.format('s' if isinstance(meta.get('author', None), list) else ''),
                    u'\n'.join([u'- {}'.format(a) for a in assure_list(meta.get('author', []))])),
                ('Homepage', meta.get('homepage', '')),
                ('Reference', meta.get('citation', '')),
                ('License', meta.get('license', '')),
                ('Keywords', u', '.join([u'`{}`'.format(k) for k in assure_list(meta.get('tag', []))])),
                ('Funding', meta.get('fundedby', '')),
                ):
            if label and content:
                metainfo += u'\n\n### {}\n\n{}'.format(label, content)
            elif content:
                metainfo += u'\n\n{}'.format(content)

        for key in 'title', 'name', 'shortdescription':
            if 'title' in meta:
                break
            if key in meta:
                meta['title'] = meta[key]

        default_content=u"""\
# {title}{metainfo}

## General information

This is a DataLad dataset{id}.

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://docs.datalad.org
""".format(
            title='Dataset "{}"'.format(meta['title']) if 'title' in meta else 'About this dataset',
            metainfo=metainfo,
            id=u' (id: {})'.format(dataset.id) if dataset.id else '',
            )

        with open(filename, 'a' if existing == 'append' else 'w', encoding='utf-8') as fp:
            fp.write(default_content)
            yield dict(
                status='ok',
                path=filename,
                type='file',
                action='add_readme')

        for r in dataset.save(
                filename,
                message='[DATALAD] added README',
                result_filter=None,
                result_xfm=None):
            yield r
Example #39
    def __call__(name=None, url=None, dataset=None,
                 pushurl=None, recursive=False, fetch=False, force=False,
                 as_common_datasrc=None, publish_depends=None,
                 publish_by_default=None):

        # TODO: Detect malformed URL and fail?

        # XXX possibly fail if fetch is False and as_common_datasrc
        # not yet sure if that is an error
        if name is None or (url is None and pushurl is None):
            raise ValueError("""insufficient information to add a sibling
                (needs at least a dataset, a name and a URL).""")
        if url is None:
            url = pushurl

        ds = require_dataset(dataset, check_installed=True,
                             purpose='sibling addition')
        assert(ds.repo is not None)

        ds_basename = basename(ds.path)
        repos = OrderedDict()
        repos[ds_basename] = {'repo': ds.repo}

        if recursive:
            for subds_name in ds.get_subdatasets(recursive=True):
                subds_path = opj(ds.path, subds_name)
                subds = Dataset(subds_path)
                lgr.debug("Adding sub-dataset %s for adding a sibling",
                          subds_path)
                if not subds.is_installed():
                    lgr.info("Skipping adding sibling for %s since it "
                             "is not installed", subds)
                    continue
                repos[ds_basename + '/' + subds_name] = {
                    #                repos[subds_name] = {
                    'repo': GitRepo(subds_path, create=False)
                }

        # Note: This is copied from create_sibling
        # as it is the same logic as for its target_dir.
        # TODO: centralize and generalize template symbol handling
        # TODO: Check pushurl for template symbols too. Probably raise if only
        #       one of them uses such symbols

        replicate_local_structure = "%NAME" not in url

        for repo_name in repos:
            repo = repos[repo_name]
            if not replicate_local_structure:
                repo['url'] = url.replace("%NAME",
                                           repo_name.replace("/", "-"))
                if pushurl:
                    repo['pushurl'] = pushurl.replace("%NAME",
                                                       repo_name.replace("/",
                                                                          "-"))
            else:
                repo['url'] = url
                if pushurl:
                    repo['pushurl'] = pushurl

                if repo_name != ds_basename:
                    repo['url'] = _urljoin(repo['url'], repo_name[len(ds_basename) + 1:])
                    if pushurl:
                        repo['pushurl'] = _urljoin(repo['pushurl'], repo_name[len(ds_basename) + 1:])

        # define config var name for potential publication dependencies
        depvar = 'remote.{}.datalad-publish-depends'.format(name)

        # collect existing remotes:
        already_existing = list()
        conflicting = list()
        for repo_name in repos:
            repoinfo = repos[repo_name]
            repo = repoinfo['repo']
            if name in repo.get_remotes():
                already_existing.append(repo_name)
                lgr.debug("Remote '{0}' already exists "
                          "in '{1}'.""".format(name, repo_name))

                existing_url = repo.get_remote_url(name)
                existing_pushurl = \
                    repo.get_remote_url(name, push=True)

                if (not existing_url or repoinfo['url'].rstrip('/') != existing_url.rstrip('/')) \
                        or (pushurl and existing_pushurl and
                            repoinfo['pushurl'].rstrip('/') !=
                                    existing_pushurl.rstrip('/')) \
                        or (pushurl and not existing_pushurl) \
                        or (publish_depends and set(ds.config.get(depvar, [])) != set(publish_depends)):
                    conflicting.append(repo_name)

        if not force and conflicting:
            raise RuntimeError("Sibling '{0}' already exists with conflicting"
                               " URL for {1} dataset(s). {2}".format(
                                   name, len(conflicting), conflicting))

        successfully_added = list()
        for repo_name in repos:
            repoinfo = repos[repo_name]
            repo = repoinfo['repo']
            if repo_name in already_existing:
                if repo_name not in conflicting:
                    lgr.debug("Skipping {0}. Nothing to do.".format(repo_name))
                    continue
                # rewrite url
                repo.set_remote_url(name, repoinfo['url'])
            else:
                # add the remote
                repo.add_remote(name, repoinfo['url'])
            if pushurl:
                repo.set_remote_url(name, repoinfo['pushurl'], push=True)
            if fetch:
                # fetch the remote so we are up to date
                lgr.debug("Fetching sibling %s of %s", name, repo_name)
                repo.fetch(name)

            if publish_depends:
                if depvar in ds.config:
                    # config vars are incremental, so make sure we start from
                    # scratch
                    ds.config.unset(depvar, where='local', reload=False)
                for d in assure_list(publish_depends):
                    lgr.info(
                        'Configure additional publication dependency on "%s"',
                        d)
                    ds.config.add(depvar, d, where='local', reload=False)
                ds.config.reload()

            if publish_by_default:
                for refspec in assure_list(publish_by_default):
                    lgr.info(
                        'Configure additional default publication refspec "%s"',
                        refspec)
                    ds.config.add("remote.{}.push".format(name), refspec,
                                  'local')
                ds.config.reload()

            assert isinstance(repo, GitRepo)  # just against silly code
            if isinstance(repo, AnnexRepo):
                # we need to check if the added sibling is an annex, and try to enable it
                # another part of the fix for #463 and #432
                try:
                    if not ds.config.obtain(
                            'remote.{}.annex-ignore'.format(name),
                            default=False,
                            valtype=EnsureBool(),
                            store=False):
                        repo.enable_remote(name)
                except CommandError as exc:
                    lgr.info("Failed to enable annex remote %s, "
                             "could be a pure git" % name)
                    lgr.debug("Exception was: %s" % exc_str(exc))
                if as_common_datasrc:
                    ri = RI(repoinfo['url'])
                    if isinstance(ri, URL) and ri.scheme in ('http', 'https'):
                        # XXX what if there is already a special remote
                        # of this name? Above check for remotes ignores special
                        # remotes. we need to `git annex dead REMOTE` on reconfigure
                        # before we can init a new one
                        # XXX except it is not enough

                        # make special remote of type=git (see #335)
                        repo._run_annex_command(
                            'initremote',
                            annex_options=[
                                as_common_datasrc,
                                'type=git',
                                'location={}'.format(repoinfo['url']),
                                'autoenable=true'])
                    else:
                        lgr.warning(
                            'Not configuring "%s" as a common data source, '
                            'URL protocol is not http or https',
                            name)

            successfully_added.append(repo_name)

        return successfully_added
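
To illustrate the %NAME template handling above in isolation, here is a small sketch with made-up repository names and URLs (not part of the example):

# Hypothetical illustration of the %NAME substitution used above: a '/'
# in a (sub)dataset name is flattened to '-' before insertion into the URL.
url_template = 'ssh://server/repos/%NAME'
repo_names = ['myds', 'myds/subds1', 'myds/subds2']

replicate_local_structure = "%NAME" not in url_template
for repo_name in repo_names:
    if not replicate_local_structure:
        url = url_template.replace("%NAME", repo_name.replace("/", "-"))
    else:
        # same base URL for all; the local directory layout is replicated
        url = url_template
    print('{} -> {}'.format(repo_name, url))
# myds -> ssh://server/repos/myds
# myds/subds1 -> ssh://server/repos/myds-subds1
# myds/subds2 -> ssh://server/repos/myds-subds2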
Example #40
    def __call__(
            path=None,
            source=None,
            dataset=None,
            get_data=False,
            description=None,
            recursive=False,
            recursion_limit=None,
            if_dirty='save-before',
            save=True,
            reckless=False,
            git_opts=None,
            git_clone_opts=None,
            annex_opts=None,
            annex_init_opts=None,
            jobs=None):

        # normalize path argument to be equal when called from cmdline and
        # python and nothing was passed into `path`
        path = assure_list(path)

        if not source and not path:
            raise InsufficientArgumentsError(
                "Please provide at least a source or a path")


        ## Common kwargs to pass to underlying git/install calls.
        #  They might need adjustments (e.g. for recursion_limit), but
        #  otherwise would be applicable throughout
        #
        # There should have been more common options!
        # Underneath, `get` could do similar installs, but for now the two
        # have duplicated implementations which differ (e.g. get does not
        # annex-init installed annexes)
        common_kwargs = dict(
            get_data=get_data,
            recursive=recursive,
            recursion_limit=recursion_limit,
            git_opts=git_opts,
            annex_opts=annex_opts,
            reckless=reckless,
            jobs=jobs,
        )

        installed_items = []
        failed_items = []

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = None
        if dataset is not None:
            ds = require_dataset(dataset, check_installed=True,
                                 purpose='installation')
            handle_dirty_dataset(ds, if_dirty)

        # switch into scenario without --source:
        if source is None:
            # we need to collect URLs and paths
            to_install = []
            to_get = []
            for urlpath in path:
                ri = RI(urlpath)
                (to_get if isinstance(ri, PathRI) else to_install).append(urlpath)

            common_kwargs['dataset'] = dataset

            # first install, and then get
            for s in to_install:
                lgr.debug("Install passes into install source=%s", s)
                try:
                    result = Install.__call__(
                                    source=s,
                                    description=description,
                                    if_dirty=if_dirty,
                                    save=save,
                                    git_clone_opts=git_clone_opts,
                                    annex_init_opts=annex_init_opts,
                                    **common_kwargs
                                )
                    installed_items += assure_list(result)
                except Exception as exc:
                    lgr.warning("Installation of %s has failed: %s",
                                s, exc_str(exc))
                    failed_items.append(s)

            if to_get:
                lgr.debug("Install passes into get %d items", len(to_get))
                # all commented-out options hint at the inability to pass them
                # into the underlying install-related calls.
                # Also need to pass from get:
                #  annex_get_opts
                try:
                    installed_datasets = Get.__call__(
                        to_get,
                        # description=description,
                        # if_dirty=if_dirty,
                        # save=save,
                        # git_clone_opts=git_clone_opts,
                        # annex_init_opts=annex_init_opts
                        _return_datasets=True,
                        **common_kwargs
                    )
                except IncompleteResultsError as exc:
                    exc_str_ = ': ' + exc_str(exc) if exc.results else ''
                    lgr.warning("Some items failed to install: %s",
                                exc_str_)
                    installed_datasets = exc.results
                    failed_items.extend(exc.failed)

                # compose content_by_ds into result
                for dspath in installed_datasets:
                    ds_ = Dataset(dspath)
                    if ds_.is_installed():
                        installed_items.append(ds_)
                    else:
                        lgr.warning("%s was not installed", ds_)

            return Install._handle_and_return_installed_items(
                ds, installed_items, failed_items, save)

        if source and path and len(path) > 1:
            raise ValueError(
                "install needs a single PATH when source is provided.  "
                "Was given mutliple PATHs: %s" % str(path))

        # parameter constraints:
        if not source:
            raise InsufficientArgumentsError(
                "a `source` is required for installation")

        # code below deals with a single path only
        path = path[0] if path else None

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "installation `source` and destination `path` are identical. "
                "If you are trying to add a subdataset simply use `save` %s".format(
                    path))

        # resolve the target location (if local) against the provided dataset
        # or CWD:
        if path is not None:
            # Should work out just fine for regular paths, so no additional
            # conditioning is necessary
            try:
                path_ri = RI(path)
            except Exception as e:
                raise ValueError(
                    "invalid path argument {}: ({})".format(path, exc_str(e)))
            try:
                # Wouldn't work for SSHRI ATM, see TODO within SSHRI
                # yoh: path should be a local path, and mapping note within
                #      SSHRI about mapping localhost:path to path is kinda
                #      a peculiar use-case IMHO
                path = resolve_path(path_ri.localpath, dataset)
                # any `path` argument that point to something local now
                # resolved and is no longer a URL
            except ValueError:
                # URL doesn't point to a local something
                # so we have an actual URL in `path`. Since this is valid as a
                # single positional argument, `source` has to be None at this
                # point.
                if is_datalad_compat_ri(path) and source is None:
                    # we have an actual URL -> this should be the source
                    lgr.debug(
                        "Single argument given to install, that doesn't seem to "
                        "be a local path. "
                        "Assuming the argument identifies a source location.")
                    source = path
                    path = None

                else:
                    # `path` is neither a valid source nor a local path.
                    # TODO: The only thing left is a known subdataset with a
                    # name, that is not a path; Once we correctly distinguish
                    # between path and name of a submodule, we need to consider
                    # this.
                    # For now: Just raise
                    raise ValueError("Invalid path argument {0}".format(path))
        # `path` resolved, if there was any.

        # Possibly do conversion from source into a git-friendly url
        # luckily GitRepo will undo any fancy file:/// url to make use of Git's
        # optimization for local clones....
        source = _get_git_url_from_source(source)
        lgr.debug("Resolved source: {0}".format(source))
        # TODO: we probably need to resolve source, if it is a local path;
        # expandpath, normpath, ... Where exactly is the point to do it?

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            lgr.debug(
                "Neither dataset nor target installation path provided. "
                "Deriving destination path from given source %s",
                source)
            path = _get_installationpath_from_url(source)
            # since this is a relative `path`, resolve it:
            path = resolve_path(path, dataset)

        # there is no other way -- my intoxicated brain tells me
        assert(path is not None)

        lgr.debug("Resolved installation target: {0}".format(path))
        destination_dataset = Dataset(path)

        if destination_dataset.is_installed():
            # this should not be, check if this is an error, or a reinstall
            # from the same source
            # this is where we would have installed this from
            candidate_sources = _get_flexible_source_candidates(
                source, destination_dataset.path)
            # this is where it was installed from
            track_name, track_url = _get_tracking_source(destination_dataset)
            if track_url in candidate_sources \
                    or get_local_file_url(track_url) in candidate_sources:
                # TODO: this one breaks "promise" assumptions of the repeated
                # invocations of install.
                # yoh thinks that we actually should be the ones to run update
                # (without merge) after basic
                # check that it is clean and up-to-date with its super dataset
                # and if so, not return here but continue with errands (recursive
                # installation and get_data) so we could provide the same
                # result if we rerun the same install twice.
                lgr.info(
                    "%s was already installed from %s. Use `update` to obtain "
                    "latest updates, or `get` or `install` with a path, not URL, "
                    "to (re)fetch data and / or subdatasets",
                    destination_dataset, track_url)
                return destination_dataset
            else:
                raise ValueError("There is already a dataset installed at the "
                                 "destination: %s", destination_dataset)

        ###########
        # we should know everything necessary by now
        # actual installation starts
        ###########

        # FLOW GUIDE:
        # four cases:
        # 1. install into a dataset
        #   1.1. we install a known subdataset
        #        => git submodule update --init
        #   1.2. we install an existing repo as a subdataset inplace
        #        => git submodule add + magic
        #   1.3. we (recursively) try to install implicit subdatasets between
        #        ds and path
        #   1.4. we install a new subdataset from an explicit source
        #        => git submodule add
        # 2. we "just" install from an explicit source
        #    => git clone

        if ds is not None:
            # FLOW GUIDE: 1.

            # express the destination path relative to the root of
            # the dataset
            relativepath = relpath(path, start=ds.path)
            if relativepath.startswith(pardir):
                raise ValueError("installation path outside dataset "
                                 "({0})".format(path))
            lgr.debug("Resolved installation target relative to dataset "
                      "{0}: {1}".format(ds, relativepath))

            # FLOW_GUIDE 1.4.
            lgr.info("Installing subdataset from '{0}' at: {0}".format(
                source, relativepath))
            destination_dataset = _install_subds_from_flexible_source(
                ds,
                relativepath,
                source,
                reckless)
        else:
            # FLOW GUIDE: 2.
            lgr.info("Installing dataset at {0} from {1}".format(path, source))

            # Currently assuming there is nothing at the target to deal with
            # and rely on failures raising from the git call ...

            # We possibly need to consider /.git URL
            candidate_sources = _get_flexible_source_candidates(source)
            _clone_from_any_source(candidate_sources, destination_dataset.path)

        # FLOW GUIDE: All four cases done.
        if not destination_dataset.is_installed():
            # XXX  shouldn't we just fail!? (unless some explicit --skip-failing?)
            lgr.error("Installation failed.")
            return None

        _handle_possible_annex_dataset(destination_dataset, reckless)

        lgr.debug("Installation of %s done.", destination_dataset)

        if not destination_dataset.is_installed():
            # log error and don't report as installed item, but don't raise,
            # since we might be in a process of recursive installation where
            # a lot of other datasets can still be installed successfully.
            lgr.error("Installation of {0} failed.".format(destination_dataset))
        else:
            installed_items.append(destination_dataset)

        # we need to decrease the recursion limit, relative to
        # subdatasets now
        subds_recursion_limit = max(0, recursion_limit - 1) \
                                  if isinstance(recursion_limit, int) \
                                  else recursion_limit
        # Now, recursive calls:
        if recursive:
            if description:
                # yoh: why?  especially if we somehow allow for templating them
                # with e.g. '%s' to catch the subdataset path
                lgr.warning("Description can't be assigned recursively.")

            subs = destination_dataset.get_subdatasets(
                # yes, it does make sense to combine no recursion with
                # recursion_limit: when the latter is 0 we get no subdatasets
                # reported, otherwise we always get the 1st-level subs
                recursive=False,
                recursion_limit=recursion_limit,
                absolute=False)

            if subs:
                lgr.debug("Obtaining subdatasets of %s: %s",
                          destination_dataset,
                          subs)

                kwargs = common_kwargs.copy()
                kwargs['recursion_limit'] = subds_recursion_limit
                rec_installed = Get.__call__(
                    subs,  # all at once
                    dataset=destination_dataset,
                    # TODO expose this
                    # yoh: exactly!
                    #annex_get_opts=annex_get_opts,
                    **kwargs
                )
                # TODO do we want to filter this so `install` only returns
                # the datasets?
                if isinstance(rec_installed, list):
                    installed_items.extend(rec_installed)
                else:
                    installed_items.append(rec_installed)

        if get_data:
            lgr.debug("Getting data of {0}".format(destination_dataset))
            kwargs = common_kwargs.copy()
            kwargs['recursive'] = False
            destination_dataset.get(curdir, **kwargs)

        return Install._handle_and_return_installed_items(
            ds, installed_items, failed_items, save)
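
As a rough stand-in for the RI/PathRI classification used above (this is not DataLad's actual implementation), the split of `path` arguments into things to clone and things to get could be sketched like this:

# Hypothetical, simplified classifier: anything with a URL scheme or an
# ssh-style 'host:' prefix is treated as a source to install, the rest as
# local paths to get.
from urllib.parse import urlparse

def looks_like_url(spec):
    parsed = urlparse(spec)
    return bool(parsed.scheme) or ':' in spec.split('/')[0]

to_install, to_get = [], []
for urlpath in ['https://example.com/ds.git', 'sub/ds1', 'host:path/ds2']:
    (to_install if looks_like_url(urlpath) else to_get).append(urlpath)

print(to_install)  # ['https://example.com/ds.git', 'host:path/ds2']
print(to_get)      # ['sub/ds1']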
Example #41
    def __call__(sshurl, name=None, target_dir=None,
                 target_url=None, target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 existing='error',
                 shared=None,
                 group=None,
                 ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None,
                 annex_wanted=None, annex_group=None, annex_groupwanted=None,
                 inherit=False,
                 since=None):
        #
        # nothing without a base dataset
        #
        ds = require_dataset(dataset, check_installed=True,
                             purpose='creating a sibling')
        refds_path = ds.path

        #
        # all checks that are possible before we start parsing the dataset
        #

        # possibly use the sibling name to obtain the sshurl, in case it is not specified
        if not sshurl:
            if not inherit:
                raise InsufficientArgumentsError(
                    "needs at least an SSH URL, if no inherit option"
                )
            if name is None:
                raise ValueError(
                    "Neither SSH URL, nor the name of sibling to inherit from "
                    "was specified"
                )
            # It might well be that we already have this remote setup
            try:
                sshurl = CreateSibling._get_remote_url(ds, name)
            except Exception as exc:
                lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc))
        elif inherit:
            raise ValueError(
                "For now, for clarity not allowing specifying a custom sshurl "
                "while inheriting settings"
            )
            # maybe this could be safely dropped -- still WiP

        if not sshurl:
            # TODO: may be more back up before _prep?
            super_ds = ds.get_superdataset()
            if not super_ds:
                raise ValueError(
                    "Could not determine super dataset for %s to inherit URL"
                    % ds
                )
            super_url = CreateSibling._get_remote_url(super_ds, name)
            # for now assuming hierarchical setup
        # (TODO: to be able to distinguish between the two, probably
            # needs storing datalad.*.target_dir to have %RELNAME in there)
            sshurl = slash_join(super_url, relpath(ds.path, super_ds.path))

        # check the login URL
        sshri = RI(sshurl)
        if not is_ssh(sshri):
            raise ValueError(
                "Unsupported SSH URL: '{0}', "
                "use ssh://host/path or host:path syntax".format(sshurl))

        if not name:
            # use the hostname as default remote name
            name = sshri.hostname
            lgr.debug(
                "No sibling name given, use URL hostname '%s' as sibling name",
                name)

        if since == '':
            # consider creating siblings only since the point of
            # the last update
            # XXX here we assume one to one mapping of names from local branches
            # to the remote
            active_branch = ds.repo.get_active_branch()
            since = '%s/%s' % (name, active_branch)

        #
        # parse the base dataset to find all subdatasets that need processing
        #
        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                # only a single path!
                path=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='create_sibling',
                # both next should not happen anyways
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                modified=since,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) != 'dataset' or ap.get('state', None) == 'absent':
                # this can happen when there is `since`, but we have no
                # use for anything but datasets here
                continue
            checkds_remotes = Dataset(ap['path']).repo.get_remotes() \
                if ap.get('state', None) != 'absent' \
                else []
            if publish_depends:
                # make sure dependencies are valid
                # TODO: inherit -- we might want to automagically create
                # those dependents as well???
                unknown_deps = set(assure_list(publish_depends)).difference(checkds_remotes)
                if unknown_deps:
                    ap['status'] = 'error'
                    ap['message'] = (
                        'unknown sibling(s) specified as publication dependency: %s',
                        unknown_deps)
                    yield ap
                    continue
            if name in checkds_remotes and existing in ('error', 'skip'):
                ap['status'] = 'error' if existing == 'error' else 'notneeded'
                ap['message'] = (
                    "sibling '%s' already configured (specify alternative name, or force "
                    "reconfiguration via --existing",
                    name)
                yield ap
                continue
            to_process.append(ap)

        if not to_process:
            # we ruled out all possibilities
            # TODO wait for gh-1218 and make better return values
            lgr.info("No datasets qualify for sibling creation. "
                     "Consider different settings for --existing "
                     "or --since if this is unexpected")
            return

        if target_dir is None:
            if sshri.path:
                target_dir = sshri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = "%RELNAME" not in target_dir

        # request ssh connection:
        lgr.info("Connecting ...")
        assert(sshurl is not None)  # delayed sanity check
        ssh = ssh_manager.get_connection(sshurl)
        if not ssh.get_annex_version():
            raise MissingExternalDependency(
                'git-annex',
                msg='on the remote system')

        #
        # all checks done and we have a connection, now do something
        #

        # loop over all datasets, ordered from top to bottom to make the test
        # below valid (existing directories would cause the machinery to halt).
        # But we need to run the post-update hook in depth-first fashion, so
        # we only collect first and then run (see gh #790)
        yielded = set()
        remote_repos_to_run_hook_for = []
        for currentds_ap in \
                sorted(to_process, key=lambda x: x['path'].count('/')):
            current_ds = Dataset(currentds_ap['path'])

            path = _create_dataset_sibling(
                name,
                current_ds,
                ds.path,
                ssh,
                replicate_local_structure,
                sshri,
                target_dir,
                target_url,
                target_pushurl,
                existing,
                shared,
                group,
                publish_depends,
                publish_by_default,
                ui,
                as_common_datasrc,
                annex_wanted,
                annex_group,
                annex_groupwanted,
                inherit
            )
            if not path:
                # nothing new was created
                # TODO is 'notneeded' appropriate in this case?
                currentds_ap['status'] = 'notneeded'
                # TODO explain status in 'message'
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            remote_repos_to_run_hook_for.append((path, currentds_ap))

            # publish web-interface to root dataset on publication server
            if current_ds.path == ds.path and ui:
                lgr.info("Uploading web interface to %s" % path)
                try:
                    CreateSibling.upload_web_interface(path, ssh, shared, ui)
                except CommandError as e:
                    currentds_ap['status'] = 'error'
                    currentds_ap['message'] = (
                        "failed to push web interface to the remote datalad repository (%s)",
                        exc_str(e))
                    yield currentds_ap
                    yielded.add(currentds_ap['path'])
                    continue

        # in reverse order would be depth first
        lgr.info("Running post-update hooks in all created siblings")
        # TODO: add progressbar
        for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            lgr.debug("Running hook for %s (if exists and executable)", path)
            try:
                ssh("cd {} "
                    "&& ( [ -x hooks/post-update ] && hooks/post-update || : )"
                    "".format(sh_quote(_path_(path, ".git"))))
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to run post-update hook under remote path %s (%s)",
                    path, exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            if currentds_ap['path'] not in yielded:
                # if we were silent until now everything is just splendid
                currentds_ap['status'] = 'ok'
                yield currentds_ap
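
The traversal order used above (create siblings top-down, run post-update hooks bottom-up) can be sketched independently; the paths are made up:

# Hypothetical illustration of the ordering above.
to_process = ['/ds/sub/a', '/ds', '/ds/sub']

# shallow paths first: superdatasets are handled before their subdatasets
top_down = sorted(to_process, key=lambda p: p.count('/'))
print(top_down)        # ['/ds', '/ds/sub', '/ds/sub/a']

# the collected list is then walked in reverse, i.e. depth-first, for hooks
print(top_down[::-1])  # ['/ds/sub/a', '/ds/sub', '/ds']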
Example #42
    def __call__(dataset, filename='README.md', existing='skip'):
        from os.path import lexists
        from os.path import join as opj
        from io import open
        import logging
        lgr = logging.getLogger('datalad.plugin.add_readme')

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import assure_list

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='add README')

        filename = opj(dataset.path, filename)
        res_kwargs = dict(action='add_readme', path=filename)

        if lexists(filename) and existing == 'skip':
            yield dict(
                res_kwargs,
                status='notneeded',
                message='file already exists, and not appending content')
            return

        # unlock, file could be annexed
        if lexists(filename):
            dataset.unlock(filename)

        # get any metadata on the dataset itself
        dsinfo = dataset.metadata('.',
                                  reporton='datasets',
                                  return_type='item-or-list',
                                  on_failure='ignore')
        meta = {}
        if not isinstance(dsinfo, dict) or dsinfo.get('status', None) != 'ok':
            lgr.warn("Could not obtain dataset metadata, proceeding without")
            dsinfo = {}
        else:
            # flatten possibly existing multiple metadata sources
            for src in dsinfo['metadata']:
                if src.startswith('@'):
                    # not a source
                    continue
                meta.update(dsinfo['metadata'][src])

        metainfo = ''
        for label, content in (
            ('', meta.get('description', meta.get('shortdescription', ''))),
            ('Author{}'.format(
                's' if isinstance(meta.get('author', None), list) else ''),
             u'\n'.join([
                 u'- {}'.format(a) for a in assure_list(meta.get('author', []))
             ])),
            ('Homepage', meta.get('homepage', '')),
            ('Reference', meta.get('citation', '')),
            ('License', meta.get('license', '')),
            ('Keywords', u', '.join([
                u'`{}`'.format(k) for k in assure_list(meta.get('tag', []))
            ])),
            ('Funding', meta.get('fundedby', '')),
        ):
            if label and content:
                metainfo += u'\n\n### {}\n\n{}'.format(label, content)
            elif content:
                metainfo += u'\n\n{}'.format(content)

        for key in 'title', 'name', 'shortdescription':
            if 'title' in meta:
                break
            if key in meta:
                meta['title'] = meta[key]

        default_content = u"""\
# {title}{metainfo}

## General information

This is a DataLad dataset{id}.

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://docs.datalad.org
""".format(
            title='Dataset "{}"'.format(meta['title'])
            if 'title' in meta else 'About this dataset',
            metainfo=metainfo,
            id=u' (id: {})'.format(dataset.id) if dataset.id else '',
        )

        with open(filename,
                  'a' if existing == 'append' else 'w',
                  encoding='utf-8') as fp:
            fp.write(default_content)
            yield dict(status='ok',
                       path=filename,
                       type='file',
                       action='add_readme')

        for r in dataset.rev_save(filename,
                                  message='[DATALAD] added README',
                                  result_filter=None,
                                  result_xfm=None):
            yield r
Example #43
def _extract_metadata(agginto_ds, aggfrom_ds, db, to_save, objid, metasources,
                      refcommit, subds_relpaths, agg_base_path):
    lgr.debug('Performing metadata extraction from %s', aggfrom_ds)
    # we will replace any conflicting info on this dataset with fresh stuff
    agginfo = db.get(aggfrom_ds.path, {})
    # paths to extract from
    relevant_paths = sorted(_get_metadatarelevant_paths(aggfrom_ds, subds_relpaths))
    # get extractors to engage from source dataset
    nativetypes = ['datalad_core', 'annex'] + assure_list(get_metadata_type(aggfrom_ds))
    # store essential extraction config in dataset record
    agginfo['extractors'] = nativetypes
    agginfo['datalad_version'] = datalad.__version__

    # perform the actual extraction
    dsmeta, contentmeta, errored = _get_metadata(
        aggfrom_ds,
        nativetypes,
        # None indicates to honor a dataset's per-extractor configuration and to be
        # on by default
        global_meta=None,
        content_meta=None,
        paths=relevant_paths)

    meta = {
        'ds': dsmeta,
        'cn': (dict(contentmeta[k], path=k) for k in sorted(contentmeta))
    }

    # inject the info about which commit we are describing into the core metadata
    # this is done here in order to avoid feeding it all the way down
    coremeta = dsmeta.get('datalad_core', {})
    version = aggfrom_ds.repo.describe(commitish=refcommit)
    if version:
        coremeta['version'] = version
    coremeta['refcommit'] = refcommit
    dsmeta['datalad_core'] = coremeta

    # for both types of metadata
    for label, props in metasources.items():
        dest = props['targetds']
        if not meta[label]:
            continue
        # only write to disk if there is something
        objrelpath = _get_obj_location(objid, label, props['dumper'])
        # place metadata object into the source dataset
        objpath = op.join(dest.path, agg_base_path, objrelpath)

        # write obj files
        if op.exists(objpath):
            dest.unlock(objpath)
        elif op.lexists(objpath):
            # if it gets here, we have a symlink that is pointing nowhere
            # kill it, to be replaced with the newly aggregated content
            dest.repo.remove(objpath)
        # TODO actually dump a compressed file when annexing is possible
        # to speed up on-demand access
        props['dumper'](meta[label], objpath)
        # stage for dataset.save()
        to_save.append(dict(path=objpath, type='file'))

        # important to use abspath here, needs to be rewritten relative to
        # all receiving datasets
        agginfo['{}_info'.format(props['type'])] = objpath

    # overwrite existing info with stuff from just finished extraction
    db[aggfrom_ds.path] = agginfo

    return errored
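
For orientation, here is a hypothetical sketch of the `metasources` mapping the loop above expects; the keys ('targetds', 'dumper', 'type') are inferred from their usage above, and the JSON dumper is only a stand-in:

import json

def json_dump(obj, path):
    # stand-in dumper: serialize a metadata record to JSON at `path`
    with open(path, 'w') as f:
        json.dump(obj, f, default=list)

def make_metasources(target_ds):
    # 'ds' -> dataset-level metadata, 'cn' -> per-file content metadata
    return {
        'ds': {'targetds': target_ds, 'dumper': json_dump, 'type': 'dataset'},
        'cn': {'targetds': target_ds, 'dumper': json_dump, 'type': 'content'},
    }

print(make_metasources('<aggregation target dataset>'))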
Example #44
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None):
    """Make a direct query of a dataset to extract its metadata.

    Parameters
    ----------
    ds : Dataset
    types : list
    """
    errored = False
    dsmeta = dict()
    contentmeta = {}

    if global_meta is not None and content_meta is not None and \
            not global_meta and not content_meta:
        # both are false and not just none
        return dsmeta, contentmeta, errored

    context = {
        '@vocab': 'http://docs.datalad.org/schema_v{}.json'.format(
            vocabulary_version)}

    fullpathlist = paths
    if paths and isinstance(ds.repo, AnnexRepo):
        # Ugly? Jep: #2055
        content_info = zip(paths, ds.repo.file_has_content(paths), ds.repo.is_under_annex(paths))
        paths = [p for p, c, a in content_info if not a or c]
        nocontent = len(fullpathlist) - len(paths)
        if nocontent:
            # TODO better fail, or support incremental and label this file as not present
            lgr.warn(
                '{} files have no content present, '
                'some extractors will not operate on {}'.format(
                    nocontent,
                    'them' if nocontent > 10
                           else [p for p, c, a in content_info if not c and a])
            )

    # pull out potential metadata field blacklist config settings
    blacklist = [re.compile(bl) for bl in assure_list(ds.config.obtain(
        'datalad.metadata.aggregate-ignore-fields',
        default=[]))]
    # enforce size limits
    max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize')
    # keep local, who knows what some extractors might pull in
    from pkg_resources import iter_entry_points  # delayed heavy import
    extractors = {ep.name: ep for ep in iter_entry_points('datalad.metadata.extractors')}

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s', ds,
        total=len(types),
        label='Metadata extraction',
        unit=' extractors',
    )
    for mtype in types:
        mtype_key = mtype
        log_progress(
            lgr.info,
            'metadataextractors',
            'Engage %s metadata extractor', mtype_key,
            update=1,
            increment=True)
        if mtype_key not in extractors:
            # we said that we want to fail, rather than just moan about less metadata
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
            raise ValueError(
                'Enabled metadata extractor %s is not available in this installation'
                % mtype_key)
        try:
            extractor_cls = extractors[mtype_key].load()
            extractor = extractor_cls(
                ds,
                paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist)
        except Exception as e:
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
            raise ValueError(
                "Failed to load metadata extractor for '%s', "
                "broken dataset configuration (%s)?: %s"
                % (mtype, ds, exc_str(e)))
        try:
            dsmeta_t, contentmeta_t = extractor.get_metadata(
                dataset=global_meta if global_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-dataset-{}'.format(mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()),
                content=content_meta if content_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-content-{}'.format(mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()))
        except Exception as e:
            lgr.error('Failed to get dataset metadata ({}): {}'.format(
                mtype, exc_str(e)))
            if cfg.get('datalad.runtime.raiseonerror'):
                log_progress(
                    lgr.error,
                    'metadataextractors',
                    'Failed %s metadata extraction from %s', mtype_key, ds,
                )
                raise
            errored = True
            # if we don't get global metadata we do not want content metadata
            continue

        if dsmeta_t:
            if _ok_metadata(dsmeta_t, mtype, ds, None):
                dsmeta_t = _filter_metadata_fields(
                    dsmeta_t,
                    maxsize=max_fieldsize,
                    blacklist=blacklist)
                dsmeta[mtype_key] = dsmeta_t
            else:
                errored = True

        unique_cm = {}
        extractor_unique_exclude = getattr(extractor_cls, "_unique_exclude", set())
        # TODO: ATM neuroimaging extractors all provide their own internal
        #  log_progress but if they are all generators, we could provide generic
        #  handling of the progress here.  Note also that the log message actually
        #  seems to be ignored and not used, only the label ;-)
        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Metadata extraction per location for %s', mtype,
        #     # contentmeta_t is a generator... so no count is known
        #     # total=len(contentmeta_t or []),
        #     label='Metadata extraction per location',
        #     unit=' locations',
        # )
        for loc, meta in contentmeta_t or {}:
            lgr.log(5, "Analyzing metadata for %s", loc)
            # log_progress(
            #     lgr.debug,
            #     'metadataextractors_loc',
            #     'ignoredatm',
            #     label=loc,
            #     update=1,
            #     increment=True)
            if not _ok_metadata(meta, mtype, ds, loc):
                errored = True
                # log_progress(
                #     lgr.debug,
                #     'metadataextractors_loc',
                #     'ignoredatm',
                #     label='Failed for %s' % loc,
                # )
                continue
            # we also want to store info that there was no metadata (e.g. to get a list of
            # files that have no metadata)
            # if there is an issue that an extractor needlessly produces empty records, the
            # extractor should be fixed rather than adding a general switch. For example the datalad_core
            # issues empty records to document the presence of a file
            #elif not meta:
            #    continue

            # apply filters
            meta = _filter_metadata_fields(
                meta,
                maxsize=max_fieldsize,
                blacklist=blacklist)

            if not meta:
                continue

            # assign
            # only ask each metadata extractor once, hence no conflict possible
            loc_dict = contentmeta.get(loc, {})
            loc_dict[mtype_key] = meta
            contentmeta[loc] = loc_dict

            if ds.config.obtain(
                    'datalad.metadata.generate-unique-{}'.format(mtype_key.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()):
                # go through content metadata and inject report of unique keys
                # and values into `dsmeta`
                for k, v in iteritems(meta):
                    if k in dsmeta.get(mtype_key, {}):
                        # if the dataset already has a dedicated idea
                        # about a key, we skip it from the unique list
                        # the point of the list is to make missing info about
                        # content known in the dataset, not to blindly
                        # duplicate metadata. Example: a list of samples the data
                        # were recorded from. If the dataset has such a list under
                        # a 'sample' key, we should prefer that, over an
                        # aggregated list of a hopefully-kinda-ok structure
                        continue
                    elif k in extractor_unique_exclude:
                        # the extractor thinks this key is worthless for the purpose
                        # of discovering whole datasets
                        # we keep the key (so we know that some file is providing this key),
                        # but ignore any value it came with
                        unique_cm[k] = None
                        continue
                    vset = unique_cm.get(k, set())
                    vset.add(_val2hashable(v))
                    unique_cm[k] = vset

        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Finished metadata extraction across locations for %s', mtype)

        if unique_cm:
            # per source storage here too
            ucp = dsmeta.get('datalad_unique_content_properties', {})
            # important: we want to have a stable order regarding
            # the unique values (a list). we cannot guarantee the
            # same order of discovery, hence even when not using a
            # set above we would still need sorting. the challenge
            # is that any value can be an arbitrarily complex nested
            # beast
            # we also want to have each unique value set always come
            # in a top-level list, so we know if some unique value
            # was a list, as opposed to a list of unique values

            def _ensure_serializable(val):
                if isinstance(val, ReadOnlyDict):
                    return {k: _ensure_serializable(v) for k, v in iteritems(val)}
                if isinstance(val, (tuple, list)):
                    return [_ensure_serializable(v) for v in val]
                else:
                    return val

            ucp[mtype_key] = {
                k: [_ensure_serializable(i)
                    for i in sorted(
                        v,
                        key=_unique_value_key)] if v is not None else None
                for k, v in iteritems(unique_cm)
                # v == None (disable unique, but there was a value at some point)
                # otherwise we only want actual values, and also no single-item-lists
                # of a non-value
                # those contribute no information, but bloat the operation
                # (inflated number of keys, inflated storage, inflated search index, ...)
                if v is None or (v and not v == {''})}
            dsmeta['datalad_unique_content_properties'] = ucp

    log_progress(
        lgr.info,
        'metadataextractors',
        'Finished metadata extraction from %s', ds,
    )

    # always identify the effective vocabulary - JSON-LD style
    if context:
        dsmeta['@context'] = context

    return dsmeta, contentmeta, errored
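
A minimal sketch of the blacklist-based field filtering applied above; this is an illustration of the assumed behavior, not DataLad's actual `_filter_metadata_fields`:

import re

def filter_fields(meta, maxsize=None, blacklist=None):
    # drop fields whose key matches any blacklist regex, and optionally
    # fields whose string representation exceeds `maxsize` characters
    out = {}
    for k, v in meta.items():
        if blacklist and any(bl.match(k) for bl in blacklist):
            continue
        if maxsize is not None and len(str(v)) > maxsize:
            continue
        out[k] = v
    return out

blacklist = [re.compile(bl) for bl in ['^comment', 'internal_.*']]
print(filter_fields(
    {'author': 'A. Author', 'comment<0>': 'noisy', 'internal_id': 42},
    maxsize=100, blacklist=blacklist))
# -> {'author': 'A. Author'}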
Example #45
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            update_mode='target',
            incremental=False,
            force_extraction=False,
            save=True):
        refds_path = Interface.get_refds_path(dataset)

        # it really doesn't work without a dataset
        ds = require_dataset(
            dataset, check_installed=True, purpose='metadata aggregation')
        path = assure_list(path)
        if not path:
            # then current/reference dataset is "aggregated"
            # We should not add ds.path always since then --recursive would
            # also recurse current even if paths are given
            path.append(ds.path)

        agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations(ds)
        agginfo_db = load_ds_aggregate_db(ds, abspath=True)

        to_save = []
        to_aggregate = set()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='aggregate_metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            ap_type = ap.get('type', None)
            ap_state = ap.get('state', None)
            assert('parentds' in ap or ap_type == 'dataset')
            if ap_type == 'dataset' and ap_state != 'absent':
                # a present dataset, we can take directly from it
                aggsrc = ap['path']
                lgr.info('Aggregate metadata for dataset %s', aggsrc)
            else:
                # everything else needs to come from the parent
                aggsrc = ap['parentds']
                if ap_state == 'absent':
                    lgr.info(
                        'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                        ap['path'],
                        aggsrc)
                else:
                    lgr.info(
                        'Aggregate metadata for %s from dataset at %s',
                        ap['path'],
                        aggsrc)

            to_aggregate.add(aggsrc)

            if ap_state == 'absent':
                # key thought: recursive is done by path annotation, hence
                # once we hit an absent dataset, we are 100% certain that
                # there is nothing to recurse into on the file system
                # hence we only have to look into the aggregated metadata
                # of the last available dataset in the dataset tree edge
                #
                # if there is nothing at this path, we need to look into the
                # parentds and check if we know anything about this path
                # if we do, we need to grab all the info and objects
                # if not, we need to error
                res = _get_dsinfo_from_aggmetadata(
                    aggsrc, ap['path'], recursive, agginfo_db)
                if not isinstance(res, list):
                    yield get_status_dict(
                        status='impossible',
                        message=res,
                        action='aggregate_metadata',
                        path=ap['path'],
                        logger=lgr)
                    continue
                # cue for aggregation
                to_aggregate.update(res)
            else:
                # actually aggregate metadata for this dataset, immediately place
                # generated objects into the aggregated or reference dataset,
                # and put info into DB to get the distributed to all datasets
                # that need to be updated
                errored = _dump_extracted_metadata(
                    ds,
                    Dataset(aggsrc),
                    agginfo_db,
                    to_save,
                    force_extraction,
                    agg_base_path)
                if errored:
                    yield get_status_dict(
                        status='error',
                        message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                        action='aggregate_metadata',
                        path=aggsrc,
                        logger=lgr)

        # at this point we have dumped all aggregated metadata into object files
        # somewhere, we know what needs saving, but haven't saved anything yet, and
        # we know about the states of all aggregated datasets in the DB.
        # what remains to do is to update all datasets, so they have their own copy
        # of aggregated metadata and update their respective aggregate.json with
        # info on what states we just aggregated from

        # first, let's figure out what datasets need updating at all
        # get adjacency info of the dataset tree spanning from the base to all leaf datasets
        # associated with the path arguments
        if update_mode == 'all':
            ds_adj = {}
            discover_dataset_trace_to_targets(
                ds.path, to_aggregate, [], ds_adj,
                # we know that to_aggregate only lists datasets, existing and
                # absent ones -- we want to aggregate all of them, either from
                # just extracted metadata, or from previously aggregated metadata
                # of the closest superdataset
                includeds=to_aggregate)
            # TODO we need to work in the info about datasets that we only got from
            # aggregated metadata and that had no trace on the file system in here!!
            subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
        elif update_mode == 'target':
            subtrees = {ds.path: list(agginfo_db.keys())}
        else:
            raise ValueError(
                "unknown `update_mode` %r for metadata aggregation" % update_mode)

        # go over datasets in bottom-up fashion
        for parentds_path in sorted(subtrees, reverse=True):
            lgr.info('Update aggregate metadata in dataset at: %s', parentds_path)

            _update_ds_agginfo(
                ds.path,
                parentds_path,
                subtrees[parentds_path],
                incremental,
                agginfo_db,
                to_save)
            # update complete
            res = get_status_dict(
                status='ok',
                action='aggregate_metadata',
                path=parentds_path,
                type='dataset',
                logger=lgr)
            res.update(agginfo_db.get(parentds_path, {}))
            yield res
        #
        # save potential modifications to dataset global metadata
        #
        if not to_save:
            return
        lgr.info('Attempting to save %i files/datasets', len(to_save))
        for res in Save.__call__(
                path=to_save,
                dataset=refds_path,
                message='[DATALAD] Dataset aggregate metadata update',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
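
The two `update_mode` shapes handled above can be illustrated with a small, purely hypothetical data layout; the dictionaries are left empty and the paths are placeholders, so this is a sketch of the structure only, not output of the code above.

# minimal sketch, assuming a base dataset at /base with two nested subdatasets
agginfo_db = {'/base/sub1': {}, '/base/sub1/sub2': {}}

# update_mode == 'target': only the base dataset receives updated agginfo
subtrees_target = {'/base': list(agginfo_db.keys())}

# update_mode == 'all': every dataset on the trace from the base to each leaf
# is updated (as computed via discover_dataset_trace_to_targets/_adj2subtrees)
subtrees_all = {
    '/base': ['/base/sub1'],
    '/base/sub1': ['/base/sub1/sub2'],
}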
Beispiel #46
0
def only_matching_paths(res, **kwargs):
    # TODO handle relative paths by using a contained 'refds' value
    paths = assure_list(kwargs.get('path', []))
    respath = res.get('path', None)
    return respath in paths
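
One way this helper can be used is as a result filter for a DataLad-style command call; the sketch below assumes a Dataset instance and that the command's common `result_filter` parameter forwards the original call arguments (such as `path`) to the filter as keyword arguments.

# hypothetical usage sketch; only results whose 'path' matches a requested path survive
ds = Dataset('/tmp/some_ds')
for res in ds.status(path=['file1.txt'],
                     result_filter=only_matching_paths,
                     return_type='generator'):
    print(res['path'], res.get('state'))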
Beispiel #47
0
def _get_procedure_implementation(name='*', ds=None):
    """get potential procedure path and configuration

    Order of consideration is user-level, system-level, dataset,
    datalad extensions, datalad. The first one found according to this order is
    the one to be returned. Therefore local definitions/configurations take
    precedence over ones that come from outside (via a datalad extension or a
    dataset with its .datalad/config). If a dataset had precedence (as was the
    case before), the addition (or just an update) of a (sub-)dataset could
    otherwise surprisingly cause you to execute code different from what you
    defined within ~/.gitconfig or your local repository's .git/config.
    So, local definitions take precedence over remote ones, and more specific
    ones over more general ones.

    Yields
    ------
    tuple
      path, procedure name, format string, help message
    """

    ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None

    # 1. check system and user account for procedure
    for loc in (cfg.obtain('datalad.locations.user-procedures'),
                cfg.obtain('datalad.locations.system-procedures')):
        for dir in assure_list(loc):
            for m, n in _get_file_match(dir, name):
                yield (m, n,) + _get_proc_config(n)
    # 2. check dataset for procedure
    if ds is not None and ds.is_installed():
        # could be more than one
        dirs = assure_list(
                ds.config.obtain('datalad.locations.dataset-procedures'))
        for dir in dirs:
            # TODO `get` dirs if necessary
            for m, n in _get_file_match(op.join(ds.path, dir), name):
                yield (m, n,) + _get_proc_config(n, ds=ds)
        # 2.1. check subdatasets recursively
        for subds in ds.subdatasets(return_type='generator',
                                    result_xfm='datasets'):
            for m, n, f, h in _get_procedure_implementation(name=name, ds=subds):
                yield m, n, f, h

    # 3. check extensions for procedure
    # delay heavy import until here
    from pkg_resources import iter_entry_points
    from pkg_resources import resource_isdir
    from pkg_resources import resource_filename
    for entry_point in iter_entry_points('datalad.extensions'):
        # use of '/' here is OK wrt platform compatibility
        if resource_isdir(entry_point.module_name, 'resources/procedures'):
            for m, n in _get_file_match(
                    resource_filename(
                        entry_point.module_name,
                        'resources/procedures'),
                    name):
                yield (m, n,) + _get_proc_config(n)
    # 4. at last check datalad itself for procedure
    for m, n in _get_file_match(
            resource_filename('datalad', 'resources/procedures'),
            name):
        yield (m, n,) + _get_proc_config(n)
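
Because the function above is a generator that yields candidates in the documented precedence order, selecting the effective procedure amounts to taking the first hit. A minimal sketch, assuming the helper is importable from the same module:

def _first_procedure_match(name, ds=None):
    # return the first (path, name, format, help) tuple found, or None
    for match in _get_procedure_implementation(name, ds=ds):
        return match
    return None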
Beispiel #48
0
def _get_submodules(dspath, fulfilled, recursive, recursion_limit,
                    contains, bottomup, set_property, delete_property,
                    refds_path):
    if not GitRepo.is_valid_repo(dspath):
        return
    modinfo = _parse_gitmodules(dspath)
    # write access parser
    parser = None
    # TODO bring back in more global scope from below once segfaults are
    # figured out
    #if set_property or delete_property:
    #    gitmodule_path = opj(dspath, ".gitmodules")
    #    parser = GitConfigParser(
    #        gitmodule_path, read_only=False, merge_includes=False)
    #    parser.read()
    # put in giant for-loop to be able to yield results before completion
    for sm in _parse_git_submodules(dspath):
        if contains and not path_startswith(contains, sm['path']):
            # we are not looking for this subds, because it doesn't
            # match the target path
            continue
        sm.update(modinfo.get(sm['path'], {}))
        if set_property or delete_property:
            gitmodule_path = opj(dspath, ".gitmodules")
            parser = GitConfigParser(
                gitmodule_path, read_only=False, merge_includes=False)
            parser.read()
            # do modifications now before we read the info out for reporting
            # use 'submodule "NAME"' section ID style as this seems to be the default
            submodule_section = 'submodule "{}"'.format(sm['gitmodule_name'])
            # first deletions
            for dprop in assure_list(delete_property):
                parser.remove_option(submodule_section, dprop)
                # also kick from the info we just read above
                sm.pop('gitmodule_{}'.format(dprop), None)
            # and now setting values
            for sprop in assure_list(set_property):
                prop, val = sprop
                if val.startswith('<') and val.endswith('>') and '{' in val:
                    # expand template string
                    val = val[1:-1].format(
                        **dict(
                            sm,
                            refds_relpath=relpath(sm['path'], refds_path),
                            refds_relname=relpath(sm['path'], refds_path).replace(os.sep, '-')))
                parser.set_value(
                    submodule_section,
                    prop,
                    val)
                # also add to the info we just read above
                sm['gitmodule_{}'.format(prop)] = val
            Dataset(dspath).add(
                '.gitmodules', to_git=True,
                message='[DATALAD] modified subdataset properties')
            # let go of resources, locks, ...
            parser.release()

        #common = commonprefix((with_pathsep(subds), with_pathsep(path)))
        #if common.endswith(sep) and common == with_pathsep(subds):
        #    candidates.append(common)
        subdsres = get_status_dict(
            'subdataset',
            status='ok',
            type='dataset',
            logger=lgr)
        subdsres.update(sm)
        subdsres['parentds'] = dspath
        if not bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            yield subdsres

        # expand list with child submodules. keep all paths relative to parent
        # and convert jointly at the end
        if recursive and \
                (recursion_limit in (None, 'existing') or
                 (isinstance(recursion_limit, int) and
                  recursion_limit > 1)):
            for r in _get_submodules(
                    sm['path'],
                    fulfilled, recursive,
                    (recursion_limit - 1)
                    if isinstance(recursion_limit, int)
                    else recursion_limit,
                    contains,
                    bottomup,
                    set_property,
                    delete_property,
                    refds_path):
                yield r
        if bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            yield subdsres
    if parser is not None:
        # release parser lock manually, auto-cleanup is not reliable in PY3
        parser.release()
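
A sketch of how this generator might be driven; the dataset path is a placeholder and the reported keys (anything prefixed with 'gitmodule_') are assumptions based on the .gitmodules parsing above.

# hypothetical call, listing all registered submodules of a superdataset
for sm in _get_submodules(
        dspath='/data/super', fulfilled=None, recursive=True,
        recursion_limit=None, contains=None, bottomup=False,
        set_property=None, delete_property=None, refds_path='/data/super'):
    print(sm['path'], sm.get('gitmodule_url', 'n/a'))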
Beispiel #49
0
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            action=None,
            unavailable_path_status='',
            unavailable_path_msg=None,
            nondataset_path_status='error',
            force_parentds_discovery=True,
            force_subds_discovery=True,
            force_no_revision_change_discovery=True,
            force_untracked_discovery=True,
            modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset (non-given or found)")

        # prep common result props
        res_kwargs = dict(
            action=action if action else 'annotate_path',
            refds=refds_path,
            logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(
                        refds,
                        refds_path,
                        action,
                        recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = resolve_path(p, ds=refds_path)
                    if path_startswith(p, refds_path):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(
                        **dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be melted in with treatment of non dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases later on are
            # dealt with again.
            preserved_paths = []
            if requested_paths:
                preserved_paths.extend(
                    r for r in requested_paths
                    if not lexists(r['path'] if isinstance(r, dict) else r))

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen whenever `path` is a subdataset that was
                # discovered via recursive processing of another path before
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path if not islink(path) else normpath(opj(path, pardir))
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or (
                            refds_path and _with_sep(oneupdir).startswith(
                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(normpath(opj(containing_dir, pardir)))
                        # NOTE the `and refds_path` is critical, as it will determine
                        # whether a top-level dataset that was discovered gets the
                        # parent property or not, it won't get it without a common
                        # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether there is
            # ANY dataset, not which one is the true parent, logic below relies on
            # the fact that we end here, if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not path_startswith(dspath, refds_path):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                    (path_type == 'dataset' and 'registered_subds' not in path_props) or
                    path_type == 'directory' or
                    not lexists(path)):
                # if the path doesn't exist, is labeled a directory, or is a
                # dataset without this info -> record whether this is a known
                # subdataset of its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(
                    fulfilled=None, recursive=False,
                    result_xfm=None, result_filter=None, return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get(
                    'status', unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action, recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=force_no_revision_change_discovery,
                        report_untracked='all' if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
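
Assuming this `__call__` belongs to an AnnotatePaths-style command class (the surrounding class is not shown here), it would typically be consumed as a generator of annotated-path dictionaries, for example:

# hypothetical consumption sketch; class and argument values are assumptions
for ap in AnnotatePaths.__call__(path=['subdir/file.txt'],
                                 dataset='.',
                                 recursive=False):
    if ap.get('status') in ('error', 'impossible'):
        print('problem with', ap['path'], ':', ap.get('message'))
    else:
        print(ap['path'], ap.get('type'), ap.get('parentds'))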
Beispiel #50
0
    def __call__(path=None, message=None, dataset=None,
                 version_tag=None,
                 recursive=False, recursion_limit=None,
                 updated=False,
                 message_file=None,
                 to_git=None,
                 ):
        if message and message_file:
            raise ValueError(
                "Both a message and message file were specified for save()")

        path = assure_list(path)

        if message_file:
            with open(message_file) as mfh:
                message = mfh.read()

        # we want 'normal' to achieve the most compact argument list
        # for git calls
        # untracked_mode = 'no' if updated else 'normal'
        # TODO however, Repo.add() would refuse to add any dotfiles
        # in a directory that is itself untracked, hence the only
        # choice is to go with potentially crazy long lists
        # until https://github.com/datalad/datalad/issues/1454
        # has a resolution
        untracked_mode = 'no' if updated else 'all'

        # there are three basic scenarios:
        # 1. save modifications to any already tracked content
        # 2. save any content (including removal of deleted content)
        #    to bring things to a clean state
        # 3. like (2), but only operate on a given subset of content
        #    identified by paths
        # - all three have to work in conjunction with --recursive
        # - the difference between (1) and (2) should be no more
        #   than a switch from --untracked=no to --untracked=all
        #   in Repo.save()

        # we do not support
        # - simultaneous operations on multiple datasets from disjoint
        #   dataset hierarchies, hence a single reference dataset must be
        #   identifiable from either
        #   - curdir or
        #   - the `dataset` argument.
        #   This avoids complex annotation loops and hierarchy tracking.
        # - any modification upwards from the root dataset

        ds = require_dataset(dataset, check_installed=True, purpose='saving')

        # use status() to do all discovery and annotation of paths
        paths_by_ds = {}
        for s in Status()(
                # ATTN: it is vital to pass the `dataset` argument as-is,
                # and not a dataset instance, in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=path,
                untracked=untracked_mode,
                recursive=recursive,
                recursion_limit=recursion_limit,
                result_renderer='disabled'):
            # fish out status dict for this parent dataset
            ds_status = paths_by_ds.get(s['parentds'], {})
            # reassemble path status info as repo.status() would have made it
            ds_status[ut.Path(s['path'])] = \
                {k: v for k, v in iteritems(s)
                 if k not in (
                     'path', 'parentds', 'refds', 'status', 'action',
                     'logger')}
            paths_by_ds[s['parentds']] = ds_status

        lgr.debug('Determined %i datasets for saving from input arguments',
                  len(paths_by_ds))
        # figure out what datasets to process, start with the ones containing
        # the paths that were given as arguments
        discovered_datasets = list(paths_by_ds.keys())
        if dataset:
            # if a reference dataset was given we want to save all the way up
            # to it, so let's throw it into the mix
            discovered_datasets.append(ds.path)
        # sort the datasets into (potentially) disjoint hierarchies,
        # or a single one, if a reference dataset was given
        dataset_hierarchies = get_tree_roots(discovered_datasets)
        for rootds, children in iteritems(dataset_hierarchies):
            edges = {}
            discover_dataset_trace_to_targets(
                rootds, children, [], edges, includeds=children)
            for superds, subdss in iteritems(edges):
                superds_status = paths_by_ds.get(superds, {})
                for subds in subdss:
                    # TODO actually start from an entry that may already
                    # exist in the status record
                    superds_status[ut.Path(subds)] = dict(
                        # shot from the hip, some status config
                        # to trigger this specific super/sub
                        # relation to be saved
                        state='untracked',
                        type='dataset')
                paths_by_ds[superds] = superds_status

        # TODO parallelize, whenever we have multiple subdataset of a single
        # dataset they can all be processed simultaneously
        # sort list of dataset to handle, starting with the ones deep down
        for pdspath in sorted(paths_by_ds, reverse=True):
            pds = Dataset(pdspath)
            # pop status for this dataset, we are not coming back to it
            pds_status = {
                # for handing over to the low-level code, we recode any
                # path relative to the real repo location; this avoids
                # cumbersome symlink handling without context in the
                # lower levels
                pds.repo.pathobj / p.relative_to(pdspath): props
                for p, props in iteritems(paths_by_ds.pop(pdspath))}
            start_commit = pds.repo.get_hexsha()
            if not all(p['state'] == 'clean' for p in pds_status.values()):
                for res in pds.repo.save_(
                        message=message,
                        # make sure to have the `path` arg be None, as we want
                        # to prevent and bypass any additional repo.status()
                        # calls
                        paths=None,
                        # prevent whining of GitRepo
                        git=True if not hasattr(ds.repo, 'annexstatus')
                        else to_git,
                        # we are supplying the full status already, do not
                        # detect anything else
                        untracked='no',
                        _status=pds_status):
                    # TODO remove stringification when datalad-core can handle
                    # path objects, or when PY3.6 is the lowest supported
                    # version
                    for k in ('path', 'refds'):
                        if k in res:
                            res[k] = text_type(
                                # recode path back to dataset path anchor
                                pds.pathobj / res[k].relative_to(
                                    pds.repo.pathobj)
                            )
                    yield res
            # report on the dataset itself
            dsres = dict(
                action='save',
                type='dataset',
                path=pds.path,
                refds=ds.path,
                status='ok'
                if start_commit != pds.repo.get_hexsha()
                else 'notneeded',
                logger=lgr,
            )
            if not version_tag:
                yield dsres
                continue
            try:
                pds.repo.tag(version_tag)
                dsres.update(
                    status='ok',
                    version_tag=version_tag)
                yield dsres
            except CommandError as e:
                if dsres['status'] == 'ok':
                    # first we yield the result for the actual save
                    yield dsres.copy()
                # and now complain that tagging didn't work
                dsres.update(
                    status='error',
                    message=('cannot tag this version: %s', e.stderr.strip()))
                yield dsres
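
The three scenarios spelled out in the comments above map onto calls like the following, assuming this `__call__` backs a bound Dataset.save() method with the same signature; paths and messages are placeholders.

ds = Dataset('/tmp/myds')
ds.save(message='update tracked content', updated=True)    # (1) only already tracked content
ds.save(message='save everything')                         # (2) full clean-up, incl. untracked
ds.save(path=['docs/notes.txt'], message='save one file')  # (3) restrict to the given paths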
Beispiel #51
0
def rev_resolve_path(path, ds=None):
    """Resolve a path specification (against a Dataset location)

    Any path is returned as an absolute path. If, and only if, a dataset
    object instance is given as `ds`, relative paths are interpreted as
    relative to the given dataset. In all other cases, relative paths are
    treated as relative to the current working directory.

    Note however, that this function is not able to resolve arbitrarily
    obfuscated path specifications. All operations are purely lexical, and no
    actual path resolution against the filesystem content is performed.
    Consequently, common relative path arguments like '../something' (relative
    to PWD) can be handled properly, but things like 'down/../under' cannot, as
    resolving this path properly depends on the actual target of any
    (potential) symlink leading up to '..'.

    Parameters
    ----------
    path : str or PathLike or list
      Platform-specific path specification. Multiple path
      specifications can be given as a list.
    ds : Dataset or None
      Dataset instance to resolve relative paths against.

    Returns
    -------
    `pathlib.Path` object or list(Path)
      When a list was given as input a list is returned, a Path instance
      otherwise.
    """
    got_ds_instance = isinstance(ds, Dataset)
    if ds is not None and not got_ds_instance:
        ds = require_dataset(
            ds, check_installed=False, purpose='path resolution')
    out = []
    for p in assure_list(path):
        if ds is None or not got_ds_instance:
            # no dataset at all or no instance provided -> CWD is always the reference
            # nothing needs to be done here. Path-conversion and absolutification
            # are done next
            pass
        # we have a given datasets instance
        elif not Path(p).is_absolute():
            # we have a dataset and no abspath nor an explicit relative path ->
            # resolve it against the dataset
            p = ds.pathobj / p

        p = ut.Path(p)

        # make sure we return an absolute path, but without actually
        # resolving anything
        if not p.is_absolute():
            # in general it is almost impossible to use resolve() when
            # we can have symlinks in the root path of a dataset
            # (that we don't want to resolve here), symlinks to annex'ed
            # files (that we never want to resolve), and other within-repo
            # symlinks that we (sometimes) want to resolve (i.e. symlinked
            # paths for addressing content vs adding content)
            # CONCEPT: do the minimal thing to catch most real-world inputs
            # ASSUMPTION: the only sane relative path input that needs
            # handling and can be handled are upward references like
            # '../../some/that', whereas stuff like 'down/../someotherdown'
            # are intellectual exercises
            # ALGORITHM: match any number of leading '..' path components
            # and shorten the PWD by that number
            # NOT using ut.Path.cwd(), because it has symlinks resolved!!
            pwd_parts = ut.Path(getpwd()).parts
            path_parts = p.parts
            leading_parents = 0
            for pp in p.parts:
                if pp == op.pardir:
                    leading_parents += 1
                    path_parts = path_parts[1:]
                elif pp == op.curdir:
                    # we want to discard that, but without stripping
                    # a corresponding parent
                    path_parts = path_parts[1:]
                else:
                    break
            p = ut.Path(
                op.join(
                    *(pwd_parts[:-leading_parents if leading_parents else None]
                      + path_parts)))
        # note that we will not "normpath()" the result, check the
        # pathlib docs for why this is the only sane choice in the
        # face of the possibility of symlinks in the path
        out.append(p)
    return out[0] if isinstance(path, (string_types, PurePath)) else out
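
A few illustrative calls, assuming a dataset instance at /data/ds and a working directory of /home/me (all paths are placeholders):

ds = Dataset('/data/ds')
rev_resolve_path('sub/file.txt', ds=ds)      # -> Path('/data/ds/sub/file.txt')
rev_resolve_path('../elsewhere')             # -> Path('/home/elsewhere'), purely lexical
rev_resolve_path(['a.txt', 'b.txt'], ds=ds)  # -> list of Path objects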
Beispiel #52
0
    def __call__(
            path=None,
            source=None,
            dataset=None,
            get_data=False,
            description=None,
            recursive=False,
            recursion_limit=None,
            save=True,
            reckless=False,
            # git_opts=None,
            # git_clone_opts=None,
            # annex_opts=None,
            # annex_init_opts=None,
            jobs="auto"):

        # normalize path argument to be equal when called from cmdline and
        # python and nothing was passed into `path`
        path = assure_list(path)

        if not source and not path:
            raise InsufficientArgumentsError(
                "Please provide at least a source or a path")

        #  Common kwargs to pass to underlying git/install calls.
        #  They might need adjustments (e.g. for recursion_limit), but
        #  otherwise would be applicable throughout
        #
        # There could have been more common options,
        # since underneath `get` could do similar installs
        common_kwargs = dict(
            get_data=get_data,
            recursive=recursive,
            recursion_limit=recursion_limit,
            # git_opts=git_opts,
            # annex_opts=annex_opts,
            reckless=reckless,
            jobs=jobs,
        )

        #installed_items = []
        #failed_items = []

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = None
        if dataset is not None:
            ds = require_dataset(dataset, check_installed=True,
                                 purpose='installation')
            common_kwargs['dataset'] = dataset

        # switch into the two scenarios without --source:
        # 1. list of URLs
        # 2. list of (sub)dataset content
        if source is None:
            # we need to collect URLs and paths
            to_install = []
            to_get = []
            # TODO: this approach is problematic, it disrupts the order of input args.
            # consequently results will be returned in an unexpected order when a
            # mixture of source URL and paths is given. Reordering is only possible when
            # everything in here is fully processed before any results can be yielded.
            # moreover, I think the semantics of the status quo implementation are a
            # bit complicated: in a mixture list a source URL will lead to a new dataset
            # at a generated default location, but a path will lead to a subdataset
            # at that exact location
            for urlpath in path:
                ri = RI(urlpath)
                (to_get if isinstance(ri, PathRI) else to_install).append(urlpath)

            # 1. multiple source URLs
            for s in to_install:
                lgr.debug("Install passes into install source=%s", s)
                for r in Install.__call__(
                        source=s,
                        description=description,
                        save=save,
                        # git_clone_opts=git_clone_opts,
                        # annex_init_opts=annex_init_opts,
                        # we need to disable error handling in order to have it done at
                        # the very top, otherwise we are not able to order a global
                        # "ignore-and-keep-going"
                        on_failure='ignore',
                        return_type='generator',
                        result_xfm=None,
                        result_filter=None,
                        **common_kwargs):
                    # no post-processing of the installed content on disk
                    # should be necessary here, all done by code further
                    # down that deals with an install from an actual `source`
                    # any necessary fixes should go there too!
                    # TODO generator: possibly adjust refds
                    yield r

            # 2. one or more dataset content paths
            if to_get:
                lgr.debug("Install passes into get %d items", len(to_get))
                # all commented out hint on inability to pass those options
                # into underlying install-related calls.
                # Also need to pass from get:
                #  annex_get_opts

                for r in Get.__call__(
                        to_get,
                        # TODO should pass-through description, not sure why disabled
                        # description=description,
                        # save=save,
                        # git_clone_opts=git_clone_opts,
                        # annex_init_opts=annex_init_opts,
                        # we need to disable error handling in order to have it done at
                        # the very top, otherwise we are not able to order a global
                        # "ignore-and-keep-going"
                        on_failure='ignore',
                        return_type='generator',
                        result_xfm=None,
                        result_filter=None,
                        **common_kwargs):
                    # no post-processing of get'ed content on disk should be
                    # necessary here; this is the responsibility of `get`
                    # (incl. adjusting the parent's gitmodules when submodules
                    # end up in an "updated" state; done in get helpers)
                    # any required fixes should go there!
                    yield r

            # we are done here
            # the rest is about install from a `source`
            return

        # an actual `source` was given
        if source and path and len(path) > 1:
            # exception is ok here, if this fails it is either direct user error
            # or we messed up one of our internal calls
            raise ValueError(
                "install needs a single PATH when source is provided.  "
                "Was given multiple PATHs: %s" % str(path))

        # parameter constraints:
        if not source:
            # exception is ok here, if this fails it is either direct user error
            # or we messed up one of our internal calls
            raise InsufficientArgumentsError(
                "a `source` is required for installation")

        # code below deals with a single path only
        path = path[0] if path else None
        # pre-compute for results below
        refds_path = Interface.get_refds_path(ds)

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            yield get_status_dict(
                'install', path=path, status='impossible', logger=lgr,
                source_url=source, refds=refds_path,
                message="installation `source` and destination `path` are identical. "
                "If you are trying to add a subdataset simply use the `add` command")
            return

        # resolve the target location (if local) against the provided dataset
        # or CWD:
        if path is not None:
            # MIH everything in here is highly similar to what common
            # interface helpers do (or should/could do), but at the same
            # is very much tailored to just apply to `install` -- I guess
            # it has to stay special

            # Should work out just fine for regular paths, so no additional
            # conditioning is necessary
            try:
                path_ri = RI(path)
            except Exception as e:
                raise ValueError(
                    "invalid path argument {}: ({})".format(path, exc_str(e)))
            try:
                # Wouldn't work for SSHRI ATM, see TODO within SSHRI
                # yoh: path should be a local path, and mapping note within
                #      SSHRI about mapping localhost:path to path is kinda
                #      a peculiar use-case IMHO
                path = resolve_path(path_ri.localpath, dataset)
                # any `path` argument that pointed to something local is now
                # resolved and no longer a URL
            except ValueError:
                # `path` is neither a valid source nor a local path.
                # TODO: The only thing left is a known subdataset with a
                # name, that is not a path; Once we correctly distinguish
                # between path and name of a submodule, we need to consider
                # this.
                # For now: Just raise
                raise ValueError("Invalid path argument {0}".format(path))
        # `path` resolved, if there was any.

        # clone dataset, will also take care of adding to superdataset, if one
        # is given
        res = Clone.__call__(
            source, path, dataset=ds, description=description,
            reckless=reckless,
            # we need to disable error handling in order to have it done at
            # the very top, otherwise we are not able to order a global
            # "ignore-and-keep-going"
            result_xfm=None,
            return_type='generator',
            result_filter=None,
            on_failure='ignore')
        # helper
        as_ds = YieldDatasets()
        destination_dataset = None
        for r in res:
            if r['action'] == 'install' and r['type'] == 'dataset':
                # make sure logic below is valid, only one dataset result is
                # coming back
                assert(destination_dataset is None)
                destination_dataset = as_ds(r)
            yield r
        assert(destination_dataset)

        # Now, recursive calls:
        if recursive or get_data:
            # dataset argument must not be passed inside since we use bound .get
            # It is ok to do "inplace" as long as we still return right
            # after the loop ends
            common_kwargs.pop('dataset', '')
            for r in destination_dataset.get(
                    curdir,
                    description=description,
                    # TODO expose this
                    # yoh: exactly!
                    #annex_get_opts=annex_get_opts,
                    # we need to disable error handling in order to have it done at
                    # the very top, otherwise we are not able to order a global
                    # "ignore-and-keep-going"
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None,
                    **common_kwargs):
                yield r
        # at this point no further post-processing should be necessary,
        # `clone` and `get` must have done that (incl. parent handling)
        # if not, bugs should be fixed in those commands
        return
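
Assuming this `__call__` backs DataLad's `install` command, the source-less scenarios and the single-source case discussed in the comments above would look roughly like this (URL and paths are placeholders):

from datalad.api import install

# a single source URL cloned to an explicit local path
install(source='https://example.com/some/repo.git', path='repo')

# no source: the given paths are treated as (sub)dataset content to obtain
install(path=['subds1', 'subds2'], recursive=True)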
Beispiel #53
0
def _configure_remote(ds, name, known_remotes, url, pushurl, fetch,
                      description, as_common_datasrc, publish_depends,
                      publish_by_default, annex_wanted, annex_required,
                      annex_group, annex_groupwanted, inherit, get_annex_info,
                      **res_kwargs):
    result_props = dict(action='configure-sibling',
                        path=ds.path,
                        type='sibling',
                        name=name,
                        **res_kwargs)
    if name is None:
        result_props['status'] = 'error'
        result_props['message'] = 'need sibling `name` for configuration'
        yield result_props
        return

    if name != 'here':
        # do all configure steps that are not meaningful for the 'here' sibling
        # AKA the local repo
        if name not in known_remotes:
            # this remote is fresh: make it known
            # just minimalistic name and URL, the rest is coming from `configure`
            ds.repo.add_remote(name, url)
            known_remotes.append(name)
        elif url:
            # not new, override URL if given
            ds.repo.set_remote_url(name, url)

        # make sure we have a configured fetch expression at this point
        fetchvar = 'remote.{}.fetch'.format(name)
        if fetchvar not in ds.repo.config:
            # place default fetch refspec in config
            # same as `git remote add` would have added
            ds.repo.config.add(fetchvar,
                               '+refs/heads/*:refs/remotes/{}/*'.format(name),
                               where='local')

        if pushurl:
            ds.repo.set_remote_url(name, pushurl, push=True)

        if publish_depends:
            # Check if all `deps` remotes are known to the `repo`
            unknown_deps = set(
                assure_list(publish_depends)).difference(known_remotes)
            if unknown_deps:
                result_props['status'] = 'error'
                result_props['message'] = (
                    'unknown sibling(s) specified as publication dependency: %s',
                    unknown_deps)
                yield result_props
                return

        # define config var name for potential publication dependencies
        depvar = 'remote.{}.datalad-publish-depends'.format(name)
        # and default pushes
        dfltvar = "remote.{}.push".format(name)

        if fetch:
            # fetch the remote so we are up to date
            for r in Update.__call__(dataset=res_kwargs['refds'],
                                     path=[dict(path=ds.path, type='dataset')],
                                     sibling=name,
                                     merge=False,
                                     recursive=False,
                                     on_failure='ignore',
                                     return_type='generator',
                                     result_xfm=None):
                # fixup refds
                r.update(res_kwargs)
                yield r

        if inherit:
            # Adjust variables which we should inherit
            delayed_super = _DelayedSuper(ds.repo)
            publish_depends = _inherit_config_var(delayed_super, depvar,
                                                  publish_depends)
            publish_by_default = _inherit_config_var(delayed_super, dfltvar,
                                                     publish_by_default)
            # Copy relevant annex settings for the sibling
            # makes sense only if current AND super are annexes, so it is
            # kind of a bummer, since it then forbids having a super that is a pure git
            if isinstance(ds.repo, AnnexRepo) and \
                    isinstance(delayed_super.repo, AnnexRepo):
                if annex_wanted is None:
                    annex_wanted = _inherit_annex_var(delayed_super, name,
                                                      'wanted')
                if annex_required is None:
                    annex_required = _inherit_annex_var(
                        delayed_super, name, 'required')
                if annex_group is None:
                    # I think it might be worth inheriting group regardless of
                    # what the value is
                    #if annex_wanted in {'groupwanted', 'standard'}:
                    annex_group = _inherit_annex_var(delayed_super, name,
                                                     'group')
                if annex_wanted == 'groupwanted' and annex_groupwanted is None:
                    # we better have a value for the expression for that group
                    annex_groupwanted = _inherit_annex_var(
                        delayed_super, name, 'groupwanted')

        if publish_depends:
            if depvar in ds.config:
                # config vars are incremental, so make sure we start from
                # scratch
                ds.config.unset(depvar, where='local', reload=False)
            for d in assure_list(publish_depends):
                lgr.info('Configure additional publication dependency on "%s"',
                         d)
                ds.config.add(depvar, d, where='local', reload=False)
            ds.config.reload()

        if publish_by_default:
            if dfltvar in ds.config:
                ds.config.unset(dfltvar, where='local', reload=False)
            for refspec in assure_list(publish_by_default):
                lgr.info(
                    'Configure additional default publication refspec "%s"',
                    refspec)
                ds.config.add(dfltvar, refspec, 'local')
            ds.config.reload()

        assert isinstance(ds.repo, GitRepo)  # just against silly code
        if isinstance(ds.repo, AnnexRepo):
            # we need to check if the added sibling is an annex, and try to enable it
            # another part of the fix for #463 and #432
            try:
                if not ds.config.obtain('remote.{}.annex-ignore'.format(name),
                                        default=False,
                                        valtype=EnsureBool(),
                                        store=False):
                    ds.repo.enable_remote(name)
            except CommandError as exc:
                # TODO yield
                # this is unlikely to ever happen, now done for AnnexRepo instances
                # only
                lgr.info("Failed to enable annex remote %s, "
                         "could be a pure git" % name)
                lgr.debug("Exception was: %s" % exc_str(exc))
            if as_common_datasrc:
                ri = RI(url)
                if isinstance(ri, URL) and ri.scheme in ('http', 'https'):
                    # XXX what if there is already a special remote
                    # of this name? Above check for remotes ignores special
                    # remotes. we need to `git annex dead REMOTE` on reconfigure
                    # before we can init a new one
                    # XXX except it is not enough

                    # make special remote of type=git (see #335)
                    ds.repo._run_annex_command('initremote',
                                               annex_options=[
                                                   as_common_datasrc,
                                                   'type=git',
                                                   'location={}'.format(url),
                                                   'autoenable=true'
                                               ])
                else:
                    yield dict(
                        status='impossible',
                        name=name,
                        message='cannot configure as a common data source, '
                        'URL protocol is not http or https',
                        **result_props)
    #
    # place configure steps that also work for 'here' below
    #
    if isinstance(ds.repo, AnnexRepo):
        for prop, var in (('wanted', annex_wanted),
                          ('required', annex_required), ('group',
                                                         annex_group)):
            if var is not None:
                ds.repo.set_preferred_content(prop, var,
                                              '.' if name == 'here' else name)
        if annex_groupwanted:
            ds.repo.set_groupwanted(annex_group, annex_groupwanted)

    if description:
        if not isinstance(ds.repo, AnnexRepo):
            result_props['status'] = 'impossible'
            result_props[
                'message'] = 'cannot set description of a plain Git repository'
            yield result_props
            return
        ds.repo._run_annex_command('describe',
                                   annex_options=[name, description])

    # report all we know at once
    info = list(
        _query_remotes(ds, name, known_remotes,
                       get_annex_info=get_annex_info))[0]
    info.update(dict(status='ok', **result_props))
    yield info
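
A sketch of how this helper might be invoked for a single sibling; all argument values are placeholders, most optional settings are disabled, and `ds` is assumed to be an existing Dataset instance backed by a git repository.

known_remotes = ds.repo.get_remotes()
for res in _configure_remote(
        ds, 'mysibling', known_remotes,
        url='https://example.com/repo.git', pushurl=None,
        fetch=False, description=None, as_common_datasrc=None,
        publish_depends=None, publish_by_default=None,
        annex_wanted=None, annex_required=None, annex_group=None,
        annex_groupwanted=None, inherit=False, get_annex_info=True,
        refds=ds.path):
    print(res['status'], res.get('message', ''))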
Beispiel #54
0
def _configure_remote(
        ds, name, known_remotes, url, pushurl, fetch, description,
        as_common_datasrc, publish_depends, publish_by_default,
        annex_wanted, annex_required, annex_group, annex_groupwanted,
        inherit, get_annex_info,
        **res_kwargs):
    result_props = dict(
        action='configure-sibling',
        path=ds.path,
        type='sibling',
        name=name,
        **res_kwargs)
    if name is None:
        result_props['status'] = 'error'
        result_props['message'] = 'need sibling `name` for configuration'
        yield result_props
        return

    if name != 'here':
        # do all configure steps that are not meaningful for the 'here' sibling
        # AKA the local repo
        if name not in known_remotes:
            # this remote is fresh: make it known
            # just minimalistic name and URL, the rest is coming from `configure`
            ds.repo.add_remote(name, url)
            known_remotes.append(name)
        elif url:
            # not new, override URL if given
            ds.repo.set_remote_url(name, url)

        # make sure we have a configured fetch expression at this point
        fetchvar = 'remote.{}.fetch'.format(name)
        if fetchvar not in ds.repo.config:
            # place default fetch refspec in config
            # same as `git remote add` would have added
            ds.repo.config.add(
                fetchvar,
                '+refs/heads/*:refs/remotes/{}/*'.format(name),
                where='local')

        if pushurl:
            ds.repo.set_remote_url(name, pushurl, push=True)

        if publish_depends:
            # Check if all `deps` remotes are known to the `repo`
            unknown_deps = set(assure_list(publish_depends)).difference(
                known_remotes)
            if unknown_deps:
                result_props['status'] = 'error'
                result_props['message'] = (
                    'unknown sibling(s) specified as publication dependency: %s',
                    unknown_deps)
                yield result_props
                return

        # define config var name for potential publication dependencies
        depvar = 'remote.{}.datalad-publish-depends'.format(name)
        # and default pushes
        dfltvar = "remote.{}.push".format(name)

        if fetch:
            # fetch the remote so we are up to date
            for r in Update.__call__(
                    dataset=res_kwargs['refds'],
                    path=[dict(path=ds.path, type='dataset')],
                    sibling=name,
                    merge=False,
                    recursive=False,
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None):
                # fixup refds
                r.update(res_kwargs)
                yield r

        if inherit:
            # Adjust variables which we should inherit
            delayed_super = _DelayedSuper(ds.repo)
            publish_depends = _inherit_config_var(
                delayed_super, depvar, publish_depends)
            publish_by_default = _inherit_config_var(
                delayed_super, dfltvar, publish_by_default)
            # Copy relevant annex settings for the sibling
            # makes sense only if both the current AND the super dataset are
            # annexes, which is kind of a bummer, since it forbids having a
            # pure-git super dataset
            if isinstance(ds.repo, AnnexRepo) and \
                    isinstance(delayed_super.repo, AnnexRepo):
                if annex_wanted is None:
                    annex_wanted = _inherit_annex_var(
                        delayed_super, name, 'wanted')
                if annex_required is None:
                    annex_required = _inherit_annex_var(
                        delayed_super, name, 'required')
                if annex_group is None:
                    # I think it might be worth inheriting the group regardless
                    # of what the value is
                    #if annex_wanted in {'groupwanted', 'standard'}:
                    annex_group = _inherit_annex_var(
                        delayed_super, name, 'group'
                    )
                if annex_wanted == 'groupwanted' and annex_groupwanted is None:
                    # we better have a value for the expression for that group
                    annex_groupwanted = _inherit_annex_var(
                        delayed_super, name, 'groupwanted'
                    )

        if publish_depends:
            if depvar in ds.config:
                # config vars are incremental, so make sure we start from
                # scratch
                ds.config.unset(depvar, where='local', reload=False)
            for d in assure_list(publish_depends):
                lgr.info(
                    'Configure additional publication dependency on "%s"',
                    d)
                ds.config.add(depvar, d, where='local', reload=False)
            ds.config.reload()

        if publish_by_default:
            if dfltvar in ds.config:
                ds.config.unset(dfltvar, where='local', reload=False)
            for refspec in assure_list(publish_by_default):
                lgr.info(
                    'Configure additional default publication refspec "%s"',
                    refspec)
                ds.config.add(dfltvar, refspec, 'local')
            ds.config.reload()

        assert isinstance(ds.repo, GitRepo)  # just against silly code
        if isinstance(ds.repo, AnnexRepo):
            # we need to check if the added sibling is an annex, and try to enable it
            # another part of the fix for #463 and #432
            try:
                exc = None
                if not ds.config.obtain(
                        'remote.{}.annex-ignore'.format(name),
                        default=False,
                        valtype=EnsureBool(),
                        store=False):
                    ds.repo.enable_remote(name)
            except (CommandError, DownloadError) as exc:
                # TODO yield
                # this is unlikely to ever happen, now done for AnnexRepo
                # instances only
                # Note: CommandError happens with git-annex
                # 6.20180416+gitg86b18966f-1~ndall+1 (prior to 6.20180510, from
                # which it starts to fail with AccessFailedError) if the URL is
                # bogus, so enableremote fails. E.g. as "tested" in test_siblings
                lgr.info(
                    "Failed to enable annex remote %s, could be a pure git "
                    "or not accessible", name)
                lgr.debug("Exception was: %s" % exc_str(exc))

            if as_common_datasrc:
                ri = RI(url)
                if isinstance(ri, URL) and ri.scheme in ('http', 'https'):
                    # XXX what if there is already a special remote
                    # of this name? Above check for remotes ignores special
                    # remotes. we need to `git annex dead REMOTE` on reconfigure
                    # before we can init a new one
                    # XXX except it is not enough

                    # make special remote of type=git (see #335)
                    ds.repo._run_annex_command(
                        'initremote',
                        annex_options=[
                            as_common_datasrc,
                            'type=git',
                            'location={}'.format(url),
                            'autoenable=true'])
                else:
                    yield dict(
                        status='impossible',
                        name=name,
                        message='cannot configure as a common data source, '
                                'URL protocol is not http or https',
                        **result_props)
    #
    # place configure steps that also work for 'here' below
    #
    if isinstance(ds.repo, AnnexRepo):
        for prop, var in (('wanted', annex_wanted),
                          ('required', annex_required),
                          ('group', annex_group)):
            if var is not None:
                ds.repo.set_preferred_content(
                    prop, var, '.' if name == 'here' else name)
        if annex_groupwanted:
            ds.repo.set_groupwanted(annex_group, annex_groupwanted)

    if description:
        if not isinstance(ds.repo, AnnexRepo):
            result_props['status'] = 'impossible'
            result_props['message'] = 'cannot set description of a plain Git repository'
            yield result_props
            return
        ds.repo._run_annex_command('describe', annex_options=[name, description])

    # report all we know at once
    info = list(_query_remotes(ds, name, known_remotes, get_annex_info=get_annex_info))[0]
    info.update(dict(status='ok', **result_props))
    yield info
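
_configure_remote() is the backend behind DataLad's `siblings` command, so in practice the same configuration would usually be requested through the public API. A hedged usage sketch (the sibling name and URL are invented, and keyword names may differ slightly across DataLad versions):

from datalad.api import Dataset

ds = Dataset('/tmp/demo')                   # hypothetical dataset location
ds.siblings(
    action='configure',
    name='myserver',                        # hypothetical sibling name
    url='https://example.com/repo.git',     # hypothetical URL
    publish_depends=['backup'],             # must already be a known remote
    annex_wanted='include=*',
)
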
Beispiel #55
0
    def __call__(
            source,
            path=None,
            dataset=None,
            description=None,
            reckless=False,
            alt_sources=None):
            # TODO next ones should be there, but cannot go anywhere
            # git_opts=None,
            # git_clone_opts=None,
            # annex_opts=None,
            # annex_init_opts=None

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        dataset = require_dataset(
            dataset, check_installed=True, purpose='cloning') \
            if dataset is not None else dataset
        refds_path = dataset.path if dataset else None

        if isinstance(source, Dataset):
            source = source.path

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "clone `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `add`".format(
                    path))

        if path is not None:
            path = resolve_path(path, dataset)

        # Possibly do conversion from source into a git-friendly url
        # luckily GitRepo will undo any fancy file:/// url to make use of Git's
        # optimization for local clones....
        source_url = source
        source_ = _get_git_url_from_source(source)
        lgr.debug("Resolved clone source from '%s' to '%s'",
                  source, source_)
        source = source_

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            path = _get_installationpath_from_url(source)
            # since this is a relative `path`, resolve it:
            path = resolve_path(path, dataset)
            lgr.debug("Determined clone target path from source")
        lgr.debug("Resolved clone target path to: '%s'", path)

        # there is no other way -- my intoxicated brain tells me
        assert(path is not None)

        destination_dataset = Dataset(path)
        dest_path = path

        status_kwargs = dict(
            action='install', ds=destination_dataset, logger=lgr,
            refds=refds_path, source_url=source_url)

        # important test! based on this, `rmtree` will happen below after a failed clone
        if exists(dest_path) and listdir(dest_path):
            if destination_dataset.is_installed():
                # check if dest was cloned from the given source before
                # this is where we would have installed this from
                guessed_sources = _get_flexible_source_candidates(
                    source, dest_path)
                # this is where it was actually installed from
                track_name, track_url = _get_tracking_source(destination_dataset)
                if track_url in guessed_sources or \
                        get_local_file_url(track_url) in guessed_sources:
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destination_dataset,
                                 source),
                        **status_kwargs)
                    return
            # anything else is an error
            yield get_status_dict(
                status='error',
                message='target path already exists and is not empty, refusing to clone into the target path',
                **status_kwargs)
            return

        if dataset is not None and relpath(path, start=dataset.path).startswith(pardir):
            yield get_status_dict(
                status='error',
                message=("clone target path '%s' not in specified target dataset '%s'",
                         path, dataset),
                **status_kwargs)
            return

        # generate candidate URLs from source argument to overcome a few corner cases
        # and hopefully be more robust than git clone
        candidate_sources = []
        # combine all given sources (incl. alternatives), maintain order
        for s in [source] + assure_list(alt_sources):
            candidate_sources.extend(_get_flexible_source_candidates(s))
        candidates_str = \
            " [%d other candidates]" % (len(candidate_sources) - 1) \
            if len(candidate_sources) > 1 \
            else ''
        lgr.info("Cloning %s%s into '%s'",
                 source, candidates_str, dest_path)
        dest_path_existed = exists(dest_path)
        error_msgs = OrderedDict()  # accumulate all error messages formatted per each url
        for isource_, source_ in enumerate(candidate_sources):
            try:
                lgr.debug("Attempting to clone %s (%d out of %d candidates) to '%s'",
                          source_, isource_ + 1, len(candidate_sources), dest_path)
                GitRepo.clone(path=dest_path, url=source_, create=True)
                break  # do not bother with other sources if succeeded
            except GitCommandError as e:
                error_msgs[source_] = exc_str_ = exc_str(e)
                lgr.debug("Failed to clone from URL: %s (%s)",
                          source_, exc_str_)
                if exists(dest_path):
                    lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                              dest_path)
                    # We must not just rmtree since it might be curdir etc
                    # we should remove all files/directories under it
                    rmtree(dest_path, children_only=dest_path_existed)
                # Whenever progress reporting is enabled, as it is now,
                # we end up without e.stderr since it is "processed" out by
                # GitPython/our progress handler.
                e_stderr = e.stderr
                from datalad.support.gitrepo import GitPythonProgressBar
                if not e_stderr and GitPythonProgressBar._last_error_lines:
                    e_stderr = os.linesep.join(GitPythonProgressBar._last_error_lines)
                if e_stderr and 'could not create work tree' in e_stderr.lower():
                    # this cannot be fixed by trying another URL
                    re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                        flags=re.MULTILINE | re.DOTALL)
                    yield get_status_dict(
                        status='error',
                        message=re_match.group(1) if re_match else "stderr: " + e_stderr,
                        **status_kwargs)
                    return

        if not destination_dataset.is_installed():
            if len(error_msgs):
                error_msg = "Failed to clone from any candidate source URL. " \
                            "Encountered errors per each url were: %s"
                error_args = (error_msgs, )
            else:
                # yoh: Not sure if we ever get here but I felt that there could
                #      be a case when this might happen and original error would
                #      not be sufficient to troubleshoot what is going on.
                error_msg = "Awkward error -- we failed to clone properly. " \
                            "Although no errors were encountered, target " \
                            "dataset at %s seems to be not fully installed. " \
                            "The 'succesful' source was: %s"
                error_args = (destination_dataset.path, source_)
            yield get_status_dict(
                status='error',
                message=(error_msg, error_args),
                **status_kwargs)
            return

        if dataset is not None:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.save(
                    dest_path,
                    return_type='generator',
                    result_filter=None,
                    result_xfm=None,
                    on_failure='ignore'):
                yield r

        _handle_possible_annex_dataset(
            destination_dataset,
            reckless,
            description=description)

        # yield successful clone of the base dataset now, as any possible
        # subdataset clone down below will not alter the Git-state of the
        # parent
        yield get_status_dict(status='ok', **status_kwargs)
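
The core of this clone routine is the loop over candidate_sources: try each URL in turn, wipe any half-created target, and stop at the first success. A stripped-down sketch of that pattern with plain git (paths and URLs are made up; the real code is more careful, e.g. it only removes the children of a pre-existing target directory):

import shutil
import subprocess
from pathlib import Path

def clone_first_working(candidate_urls, dest):
    """Try candidate URLs in order; remove a partial clone before retrying."""
    dest = Path(dest)
    errors = {}
    for url in candidate_urls:
        try:
            subprocess.run(['git', 'clone', url, str(dest)], check=True)
            return url, errors              # first success wins
        except subprocess.CalledProcessError as e:
            errors[url] = str(e)
            if dest.exists():
                shutil.rmtree(dest)         # wipe the failed attempt
    return None, errors

# hypothetical usage
winner, errors = clone_first_working(
    ['https://example.com/ds.git', 'https://mirror.example.com/ds.git'],
    '/tmp/ds-clone')
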
Beispiel #56
0
    def __call__(
            path=None,
            dataset=None,
            annex=None,
            untracked='normal',
            recursive=False,
            recursion_limit=None,
            eval_subdataset_state='full'):
        # To the next white knight that comes in to re-implement `status` as a
        # special case of `diff`. There is one fundamental difference between
        # the two commands: `status` can always use the worktree as evident on
        # disk as a constraint (e.g. to figure out which subdataset a path is in)
        # `diff` cannot do that (everything needs to be handled based on a
        # "virtual" representation of a dataset hierarchy).
        # MIH concludes that while `status` can be implemented as a special case
        # of `diff`, doing so would complicate and slow down both `diff` and
        # `status`. So while the apparent almost code-duplication between the
        # two commands feels wrong, the benefit is speed. Any future RF should
        # come with evidence that speed does not suffer, and complexity stays
        # on a manageable level
        ds = require_dataset(
            dataset, check_installed=True, purpose='status reporting')

        paths_by_ds = OrderedDict()
        if path:
            # sort any path argument into the respective subdatasets
            for p in sorted(assure_list(path)):
                # it is important to capture the exact form of the
                # given path argument, before any normalization happens
                # for further decision logic below
                orig_path = text_type(p)
                p = rev_resolve_path(p, dataset)
                root = rev_get_dataset_root(text_type(p))
                if root is None:
                    # no root, not possibly underneath the refds
                    yield dict(
                        action='status',
                        path=p,
                        refds=ds.path,
                        status='error',
                        message='path not underneath this dataset',
                        logger=lgr)
                    continue
                else:
                    if dataset and root == text_type(p) and \
                            not (orig_path.endswith(op.sep) or
                                 orig_path == "."):
                        # the given path is pointing to a dataset
                        # distinguish rsync-link syntax to identify
                        # the dataset as whole (e.g. 'ds') vs its
                        # content (e.g. 'ds/')
                        super_root = rev_get_dataset_root(op.dirname(root))
                        if super_root:
                            # the dataset identified by the path argument
                            # is contained in a superdataset, and no
                            # trailing path separator was found in the
                            # argument -> user wants to address the dataset
                            # as a whole (in the superdataset)
                            root = super_root

                root = ut.Path(root)
                ps = paths_by_ds.get(root, [])
                ps.append(p)
                paths_by_ds[root] = ps
        else:
            paths_by_ds[ds.pathobj] = None

        queried = set()
        content_info_cache = {}
        while paths_by_ds:
            qdspath, qpaths = paths_by_ds.popitem(last=False)
            if qpaths and qdspath in qpaths:
                # this is supposed to be a full query, save some
                # cycles sifting through the actual path arguments
                qpaths = []
            # try to recode the dataset path wrt to the reference
            # dataset
            # the path that it might have been located by could
            # have been a resolved path or another funky thing
            qds_inrefds = path_under_rev_dataset(ds, qdspath)
            if qds_inrefds is None:
                # nothing we support handling any further
                # there is only a single refds
                yield dict(
                    path=text_type(qdspath),
                    refds=ds.path,
                    action='status',
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s",
                        ds, qpaths),
                    logger=lgr,
                )
                continue
            elif qds_inrefds != qdspath:
                # the path this dataset was located by is not how it would
                # be referenced underneath the refds (possibly resolved
                # realpath) -> recode all paths to be underneath the refds
                qpaths = [qds_inrefds / p.relative_to(qdspath) for p in qpaths]
                qdspath = qds_inrefds
            if qdspath in queried:
                # do not report on a single dataset twice
                continue
            qds = Dataset(text_type(qdspath))
            for r in _yield_status(
                    qds,
                    qpaths,
                    annex,
                    untracked,
                    recursion_limit
                    if recursion_limit is not None else -1
                    if recursive else 0,
                    queried,
                    eval_subdataset_state,
                    content_info_cache):
                yield dict(
                    r,
                    refds=ds.path,
                    action='status',
                    status='ok',
                )
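
The first half of this implementation is about sorting every path argument into the dataset that contains it (paths_by_ds). A minimal, hedged sketch of that grouping step, using the presence of a '.git' entry to locate a repository root (the real code uses rev_get_dataset_root() and rev_resolve_path() with more careful handling of symlinks and dataset boundaries):

from collections import OrderedDict
from pathlib import Path

def get_repo_root(path):
    """Walk upwards until a directory containing '.git' is found."""
    path = Path(path).resolve()
    for candidate in [path] + list(path.parents):
        if (candidate / '.git').exists():
            return candidate
    return None

def group_paths_by_repo(paths):
    """Map repository root -> list of query paths, preserving insertion order."""
    by_root = OrderedDict()
    for p in sorted(paths):
        root = get_repo_root(p)
        if root is None:
            continue                        # not inside any repository
        by_root.setdefault(root, []).append(Path(p))
    return by_root

# hypothetical usage
print(group_paths_by_repo(['/tmp/ds/file.txt', '/tmp/ds/subds/data.dat']))
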
Beispiel #57
0
    def __call__(
        path=None,
        message=None,
        dataset=None,
        version_tag=None,
        recursive=False,
        recursion_limit=None,
        updated=False,
        message_file=None,
        to_git=None,
    ):
        if message and message_file:
            raise ValueError(
                "Both a message and message file were specified for save()")

        path = assure_list(path)

        if message_file:
            with open(message_file) as mfh:
                message = mfh.read()

        # we want 'normal' to achieve the most compact argument list
        # for git calls
        # untracked_mode = 'no' if updated else 'normal'
        # TODO however, Repo.add() would refuse to add any dotfiles
        # in a directory that is itself untracked, hence the only
        # choice is to go with potentially crazy long lists
        # until https://github.com/datalad/datalad/issues/1454
        # has a resolution
        untracked_mode = 'no' if updated else 'all'

        # there are three basic scenarios:
        # 1. save modifications to any already tracked content
        # 2. save any content (including removal of deleted content)
        #    to bring things to a clean state
        # 3. like (2), but only operate on a given subset of content
        #    identified by paths
        # - all three have to work in conjunction with --recursive
        # - the difference between (1) and (2) should be no more
        #   that a switch from --untracked=no to --untracked=all
        #   in Repo.save()

        # we do not support
        # - simultaneous operations on multiple datasets from disjoint
        #   dataset hierarchies, hence a single reference dataset must be
        #   identifiable from either
        #   - curdir or
        #   - the `dataset` argument.
        #   This avoids complex annotation loops and hierarchy tracking.
        # - any modification upwards from the root dataset

        ds = require_dataset(dataset, check_installed=True, purpose='saving')

        # use status() to do all discovery and annotation of paths
        paths_by_ds = {}
        for s in Status()(
                # ATTN: it is vital to pass the `dataset` argument as is,
                # and not a dataset instance in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=path,
                untracked=untracked_mode,
                recursive=recursive,
                recursion_limit=recursion_limit,
                result_renderer='disabled'):
            # fish out status dict for this parent dataset
            ds_status = paths_by_ds.get(s['parentds'], {})
            # reassemble path status info as repo.status() would have made it
            ds_status[ut.Path(s['path'])] = \
                {k: v for k, v in iteritems(s)
                 if k not in (
                     'path', 'parentds', 'refds', 'status', 'action',
                     'logger')}
            paths_by_ds[s['parentds']] = ds_status

        lgr.debug('Determined %i datasets for saving from input arguments',
                  len(paths_by_ds))
        # figure out what datasets to process, start with the ones containing
        # the paths that were given as arguments
        discovered_datasets = list(paths_by_ds.keys())
        if dataset:
            # if a reference dataset was given we want to save all the way up
            # to it, so let's throw it into the mix
            discovered_datasets.append(ds.path)
        # sort the datasets into (potentially) disjoint hierarchies,
        # or a single one, if a reference dataset was given
        dataset_hierarchies = get_tree_roots(discovered_datasets)
        for rootds, children in iteritems(dataset_hierarchies):
            edges = {}
            discover_dataset_trace_to_targets(rootds,
                                              children, [],
                                              edges,
                                              includeds=children)
            for superds, subdss in iteritems(edges):
                superds_status = paths_by_ds.get(superds, {})
                for subds in subdss:
                    # TODO actually start from an entry that may already
                    # exist in the status record
                    superds_status[ut.Path(subds)] = dict(
                        # shot from the hip, some status config
                        # to trigger this specific super/sub
                        # relation to be saved
                        state='untracked',
                        type='dataset')
                paths_by_ds[superds] = superds_status

        # TODO parallelize, whenever we have multiple subdataset of a single
        # dataset they can all be processed simultaneously
        # sort list of dataset to handle, starting with the ones deep down
        for pdspath in sorted(paths_by_ds, reverse=True):
            pds = Dataset(pdspath)
            # pop status for this dataset, we are not coming back to it
            pds_status = {
                # for handing over to the low-level code, we recode any
                # path relative to the real repo location, this avoids
                # cumbersome symlink handling without context in the
                # lower levels
                pds.repo.pathobj / p.relative_to(pdspath): props
                for p, props in iteritems(paths_by_ds.pop(pdspath))
            }
            start_commit = pds.repo.get_hexsha()
            if not all(p['state'] == 'clean' for p in pds_status.values()):
                for res in pds.repo.save_(
                        message=message,
                        # make sure to have the `path` arg be None, as we want
                        # to prevent and bypass any additional repo.status()
                        # calls
                        paths=None,
                        # prevent whining of GitRepo
                        git=True
                        if not hasattr(ds.repo, 'annexstatus') else to_git,
                        # we are supplying the full status already, do not
                        # detect anything else
                        untracked='no',
                        _status=pds_status):
                    # TODO remove stringification when datalad-core can handle
                    # path objects, or when PY3.6 is the lowest supported
                    # version
                    for k in ('path', 'refds'):
                        if k in res:
                            res[k] = str(
                                # recode path back to dataset path anchor
                                pds.pathobj /
                                res[k].relative_to(pds.repo.pathobj))
                    yield res
            # report on the dataset itself
            dsres = dict(
                action='save',
                type='dataset',
                path=pds.path,
                refds=ds.path,
                status='ok'
                if start_commit != pds.repo.get_hexsha() else 'notneeded',
                logger=lgr,
            )
            if not version_tag:
                yield dsres
                continue
            try:
                pds.repo.tag(version_tag)
                dsres.update(status='ok', version_tag=version_tag)
                yield dsres
            except CommandError as e:
                if dsres['status'] == 'ok':
                    # first we yield the result for the actual save
                    yield dsres.copy()
                # and now complain that tagging didn't work
                dsres.update(status='error',
                             message=('cannot tag this version: %s',
                                      e.stderr.strip()))
                yield dsres
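
One detail worth spelling out is sorted(paths_by_ds, reverse=True): a subdataset path extends its superdataset path, so it sorts after it, and a reverse sort therefore visits children before their parents. That ordering lets each subdataset be saved before the superdataset records the updated submodule state. A tiny illustration with made-up paths:

# reverse-sorting dataset paths yields children before their parents
paths_by_ds = {
    '/data/super': {},
    '/data/super/sub1': {},
    '/data/super/sub1/subsub': {},
    '/data/other': {},
}
for pdspath in sorted(paths_by_ds, reverse=True):
    print(pdspath)
# -> /data/super/sub1/subsub
# -> /data/super/sub1
# -> /data/super
# -> /data/other
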
Beispiel #58
0
    def __call__(match,
                 dataset=None,
                 search=None,
                 report=None,
                 report_matched=False,
                 format='custom',
                 regex=False):

        lgr.debug("Initiating search for match=%r and dataset %r",
                  match, dataset)
        try:
            ds = require_dataset(dataset, check_installed=True, purpose='dataset search')
            if ds.id is None:
                raise NoDatasetArgumentFound(
                    "This does not seem to be a dataset (no DataLad dataset ID "
                    "found). 'datalad create --force %s' can initialize "
                    "this repository as a DataLad dataset" % ds.path)
        except NoDatasetArgumentFound:
            exc_info = sys.exc_info()
            if dataset is None:
                if not ui.is_interactive:
                    raise NoDatasetArgumentFound(
                        "No DataLad dataset found. Specify a dataset to be "
                        "searched, or run interactively to get assistance "
                        "installing a queriable superdataset."
                    )
                # none was provided, so we could ask the user whether they
                # possibly want to install our beautiful mega-duper-super-dataset
                # TODO: following logic could possibly benefit other actions.
                if os.path.exists(LOCAL_CENTRAL_PATH):
                    central_ds = Dataset(LOCAL_CENTRAL_PATH)
                    if central_ds.is_installed():
                        if ui.yesno(
                            title="No DataLad dataset found at current location",
                            text="Would you like to search the DataLad "
                                 "superdataset at %r?"
                                  % LOCAL_CENTRAL_PATH):
                            pass
                        else:
                            reraise(*exc_info)
                    else:
                        raise NoDatasetArgumentFound(
                            "No DataLad dataset found at current location. "
                            "The DataLad superdataset location %r exists, "
                            "but does not contain an dataset."
                            % LOCAL_CENTRAL_PATH)
                elif ui.yesno(
                        title="No DataLad dataset found at current location",
                        text="Would you like to install the DataLad "
                             "superdataset at %r?"
                             % LOCAL_CENTRAL_PATH):
                    from datalad.api import install
                    central_ds = install(LOCAL_CENTRAL_PATH, source='///')
                    ui.message(
                        "From now on you can refer to this dataset using the "
                        "label '///'"
                    )
                else:
                    reraise(*exc_info)

                lgr.info(
                    "Performing search using DataLad superdataset %r",
                    central_ds.path
                )
                for res in central_ds.search(
                        match,
                        search=search, report=report,
                        report_matched=report_matched,
                        format=format, regex=regex):
                    yield res
                return
            else:
                raise

        cache_dir = opj(opj(ds.path, get_git_dir(ds.path)), 'datalad', 'cache')
        mcache_fname = opj(cache_dir, 'metadata.p%d' % pickle.HIGHEST_PROTOCOL)

        meta = None
        if os.path.exists(mcache_fname):
            lgr.debug("use cached metadata of '{}' from {}".format(ds, mcache_fname))
            meta, checksum = pickle.load(open(mcache_fname, 'rb'))
            # TODO add more sophisticated tests to decide when the cache is no longer valid
            if checksum != ds.repo.get_hexsha():
                # errrr, try again below
                meta = None

        # don't put this in an 'else', as the yet-to-be-written tests above might
        # fail and require regenerating the metadata
        if meta is None:
            lgr.info("Loading and caching local meta-data... might take a few seconds")
            if not exists(cache_dir):
                os.makedirs(cache_dir)

            meta = get_metadata(ds, guess_type=False, ignore_subdatasets=False,
                                ignore_cache=False)
            # merge all info on datasets into a single dict per dataset
            meta = flatten_metadata_graph(meta)
            # extract graph, if any
            meta = meta.get('@graph', meta)
            # build simple queriable representation
            if not isinstance(meta, list):
                meta = [meta]

            # sort entries by location (if present)
            sort_keys = ('location', 'description', 'id')
            meta = sorted(meta, key=lambda m: tuple(m.get(x, "") for x in sort_keys))

            # use pickle to store the optimized graph in the cache
            pickle.dump(
                # graph plus checksum from what it was built
                (meta, ds.repo.get_hexsha()),
                open(mcache_fname, 'wb'))
            lgr.debug("cached meta data graph of '{}' in {}".format(ds, mcache_fname))

        if report in ('', ['']):
            report = []
        elif report and not isinstance(report, list):
            report = [report]

        match = assure_list(match)
        search = assure_list(search)
        # convert all to lower case for case insensitive matching
        search = {x.lower() for x in search}

        def get_in_matcher(m):
            """Function generator to provide closure for a specific value of m"""
            mlower = m.lower()

            def matcher(s):
                return mlower in s.lower()
            return matcher

        matchers = [
            re.compile(match_).search
            if regex
            else get_in_matcher(match_)
            for match_ in match
        ]

        # location should be reported relative to current location
        # We will assume that no one chpwd()s while we are yielding
        ds_path_prefix = get_path_prefix(ds.path)

        # So we can provide a useful message whenever there was not a single
        # dataset with the specified `--search` properties
        observed_properties = set()

        # for every meta data set
        for mds in meta:
            hit = False
            hits = [False] * len(matchers)
            matched_fields = set()
            if not mds.get('type', mds.get('schema:type', None)) == 'Dataset':
                # we are presently only dealing with datasets
                continue
            # TODO consider the possibility of nested and context/graph dicts
            # but so far we were trying to build simple lists of dicts, as much
            # as possible
            if not isinstance(mds, dict):
                raise NotImplementedError("nested meta data is not yet supported")

            # manual loop for now
            for k, v in iteritems(mds):
                if search:
                    k_lower = k.lower()
                    if k_lower not in search:
                        if observed_properties is not None:
                            # record for providing a hint later
                            observed_properties.add(k_lower)
                        continue
                    # so we have a hit, no need to track
                    observed_properties = None
                if isinstance(v, dict) or isinstance(v, list):
                    v = text_type(v)
                for imatcher, matcher in enumerate(matchers):
                    if matcher(v):
                        hits[imatcher] = True
                        matched_fields.add(k)
                if all(hits):
                    hit = True
                    # no need to do it longer than necessary
                    if not report_matched:
                        break

            if hit:
                location = mds.get('location', '.')
                report_ = matched_fields.union(report if report else {}) \
                    if report_matched else report
                if report_ == ['*']:
                    report_dict = mds
                elif report_:
                    report_dict = {k: mds[k] for k in report_ if k in mds}
                    if report_ and not report_dict:
                        lgr.debug(
                            'meta data match for %s, but no to-be-reported '
                            'properties (%s) found. Present properties: %s',
                            location, ", ".join(report_), ", ".join(sorted(mds))
                        )
                else:
                    report_dict = {}  # it was empty but not None -- asked to
                    # not report any specific field
                if isinstance(location, (list, tuple)):
                    # could be that the same dataset was installed into multiple
                    # locations. For now, report them separately
                    for l in location:
                        yield opj(ds_path_prefix, l), report_dict
                else:
                    yield opj(ds_path_prefix, location), report_dict

        if search and observed_properties is not None:
            import difflib
            suggestions = {
                s: difflib.get_close_matches(s, observed_properties)
                for s in search
            }
            suggestions_str = "\n ".join(
                "%s for %s" % (", ".join(choices), s)
                for s, choices in iteritems(suggestions) if choices
            )
            lgr.warning(
                "Found no properties which matched one of the one you "
                "specified (%s).  May be you meant one among: %s.\n"
                "Suggestions:\n"
                " %s",
                ", ".join(search),
                ", ".join(observed_properties),
                suggestions_str if suggestions_str.strip() else "none"
            )
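
Two small building blocks carry most of this search: one matcher per query term (a compiled regex or a case-insensitive substring closure) and difflib-based suggestions when no field name matched. A condensed, hedged sketch of just those parts (the metadata dict is invented):

import difflib
import re

def build_matchers(terms, regex=False):
    """One callable per term: regex search, or case-insensitive substring test."""
    def substring_matcher(term):
        term_lower = term.lower()
        return lambda value: term_lower in value.lower()
    return [re.compile(t).search if regex else substring_matcher(t)
            for t in terms]

metadata = {'author': 'Jane Doe', 'description': 'EEG resting state data'}
matchers = build_matchers(['eeg'])
matched_fields = {k for k, v in metadata.items()
                  if any(m(str(v)) for m in matchers)}
print(matched_fields)                                 # {'description'}

# suggest close field names when a --search property was not found
print(difflib.get_close_matches('descripton', metadata.keys()))  # ['description']
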
Beispiel #59
0
def _get_submodules(ds, paths, fulfilled, recursive, recursion_limit,
                    contains, bottomup, set_property, delete_property,
                    refds_path):
    dspath = ds.path
    if not GitRepo.is_valid_repo(dspath):
        return
    modinfo = _parse_gitmodules(ds)
    # put in giant for-loop to be able to yield results before completion
    for sm in _parse_git_submodules(ds, paths):
        if contains and not any(
                sm['path'] == c or sm['path'] in c.parents for c in contains):
            # we are not looking for this subds, because it doesn't
            # match the target path
            continue
        # do we just need this to recurse into subdatasets, or is this a
        # real result?
        to_report = paths is None \
            or any(p == sm['path'] or p in sm['path'].parents
                   for p in paths)
        sm.update(modinfo.get(sm['path'], {}))
        if to_report and (set_property or delete_property):
            # first deletions
            for dprop in assure_list(delete_property):
                try:
                    out, err = ds.repo._git_custom_command(
                        '', ['git', 'config', '--file', '.gitmodules',
                             '--unset-all',
                             'submodule.{}.{}'.format(sm['gitmodule_name'], dprop),
                            ]
                    )
                except CommandError:
                    yield get_status_dict(
                        'subdataset',
                        status='impossible',
                        message=(
                            "Deleting subdataset property '%s' failed for "
                            "subdataset '%s', possibly did "
                            "not exist",
                            dprop, sm['gitmodule_name']),
                        logger=lgr,
                        **sm)
                # also kick from the info we just read above
                sm.pop('gitmodule_{}'.format(dprop), None)
            # and now setting values
            for sprop in assure_list(set_property):
                prop, val = sprop
                if val.startswith('<') and val.endswith('>') and '{' in val:
                    # expand template string
                    val = val[1:-1].format(
                        **dict(
                            sm,
                            refds_relpath=sm['path'].relative_to(refds_path),
                            refds_relname=text_type(
                                sm['path'].relative_to(refds_path)
                            ).replace(os.sep, '-')))
                try:
                    out, err = ds.repo._git_custom_command(
                        '', ['git', 'config', '--file', '.gitmodules',
                             '--replace-all',
                             'submodule.{}.{}'.format(sm['gitmodule_name'], prop),
                             text_type(val),
                            ]
                    )
                except CommandError as e:  # pragma: no cover
                    # this conditional may not be possible to reach, as
                    # variable name validity is checked before and Git
                    # replaces the file completely, resolving any permission
                    # issues, if the file could be read (already done above)
                    yield get_status_dict(
                        'subdataset',
                        status='error',
                        message=(
                            "Failed to set property '%s': %s",
                            prop, exc_str(e)),
                        type='dataset',
                        logger=lgr,
                        **sm)
                    # it is up to parent code to decide whether we would continue
                    # after this

                # also add to the info we just read above
                sm['gitmodule_{}'.format(prop)] = val
            Dataset(dspath).save(
                '.gitmodules', to_git=True,
                message='[DATALAD] modified subdataset properties')

        #common = commonprefix((with_pathsep(subds), with_pathsep(path)))
        #if common.endswith(sep) and common == with_pathsep(subds):
        #    candidates.append(common)
        subdsres = get_status_dict(
            'subdataset',
            status='ok',
            type='dataset',
            logger=lgr)
        subdsres.update(sm)
        subdsres['parentds'] = dspath
        if to_report and (not bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled)):
            yield subdsres

        # expand list with child submodules. keep all paths relative to parent
        # and convert jointly at the end
        if recursive and \
                (recursion_limit in (None, 'existing') or
                 (isinstance(recursion_limit, int) and
                  recursion_limit > 1)):
            for r in _get_submodules(
                    Dataset(sm['path']),
                    paths,
                    fulfilled, recursive,
                    (recursion_limit - 1)
                    if isinstance(recursion_limit, int)
                    else recursion_limit,
                    contains,
                    bottomup,
                    set_property,
                    delete_property,
                    refds_path):
                yield r
        if to_report and (bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled)):
            yield subdsres
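
The set_property/delete_property handling above is plain `git config --file .gitmodules` under the hood. A minimal sketch of those two operations, assuming a hypothetical superdataset at /tmp/super with a submodule registered as 'subds1':

import subprocess

def set_submodule_property(repo_path, sm_name, prop, value):
    """git config --file .gitmodules --replace-all submodule.<name>.<prop> <value>"""
    subprocess.run(
        ['git', 'config', '--file', '.gitmodules', '--replace-all',
         'submodule.{}.{}'.format(sm_name, prop), value],
        cwd=repo_path, check=True)

def unset_submodule_property(repo_path, sm_name, prop):
    """git config --file .gitmodules --unset-all submodule.<name>.<prop>"""
    subprocess.run(
        ['git', 'config', '--file', '.gitmodules', '--unset-all',
         'submodule.{}.{}'.format(sm_name, prop)],
        cwd=repo_path, check=True)

# hypothetical usage; the modified .gitmodules still needs to be committed,
# which the code above does via Dataset(...).save('.gitmodules', to_git=True)
set_submodule_property('/tmp/super', 'subds1', 'datalad-url',
                       'https://example.com/subds1')
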
Beispiel #60
0
def _publish_dataset(ds, remote, refspec, paths, annex_copy_options, force=False, jobs=None,
                     transfer_data='auto', **kwargs):
    # TODO: this setup is now quite ugly. The only way `refspec` can come
    # in, is when there is a tracking branch, and we get its state via
    # `refspec`

    # define config var name for potential publication dependencies
    depvar = 'remote.{}.datalad-publish-depends'.format(remote)
    # list of remotes that are publication dependencies for the
    # target remote
    publish_depends = assure_list(ds.config.get(depvar, []))

    # remote might be set to be ignored by annex, or we might not yet know its uuid
    # make sure we are up-to-date on this topic on all affected remotes, before
    # we start making decisions
    for r in publish_depends + [remote]:
        if not ds.config.get('.'.join(('remote', r, 'annex-uuid')), None):
            lgr.debug("Obtain remote annex info from '%s'", r)
            ds.repo.fetch(remote=r)
            # in order to be able to use git's config to determine what to push,
            # we need to annex merge first. Otherwise a git push might be
            # rejected if involving all matching branches for example.
            # NOTE we should not use a precomputed 'is_annex' test here, as
            # each fetch could give evidence that there is an annex
            # somewhere and replace the repo class...
            if isinstance(ds.repo, AnnexRepo):
                ds.repo.merge_annex(r)
    ds.config.reload()

    # anything that follows will not change the repo type anymore, cache
    is_annex_repo = isinstance(ds.repo, AnnexRepo)

    # Plan:
    # 1. Check if there is anything to push, and if so
    #    2. process push dependencies
    #    3. fetch and merge annex branch
    #    4. push non-annex branch(es)
    # 5. copy data to the remote if paths are provided or the remote generally wants content

    # upstream refspec needed for update (merge) and subsequent push,
    # in case there is no tracking refspec yet

    # TODO: I think this whole modification detection could be done by path
    # annotation at the very beginning -- keeping it for now to not get too
    # dizzy in the forehead....

    # if forced -- we push regardless if there are differences or not
    diff = True if force else has_diff(ds, refspec, remote, paths)

    # We might have got new information in git-annex branch although no other
    # changes
    if not diff and is_annex_repo:
        try:
            git_annex_commit = next(ds.repo.get_branch_commits('git-annex'))
        except StopIteration:
            git_annex_commit = None
        #diff = _get_remote_diff(ds, [], git_annex_commit, remote, 'git-annex')
        diff = _get_remote_diff(ds, git_annex_commit, remote, 'git-annex')
        if diff:
            lgr.info("Will publish updated git-annex")

    #
    # publish data (annex copy --to)
    #
    # # remote might be set to be ignored by annex, or we might not even know yet its uuid
    # annex_ignore = ds.config.getbool('remote.{}.annex-ignore'.format(remote), None)
    # annex_uuid = ds.config.get('remote.{}.annex-uuid'.format(remote), None)
    # if not annex_ignore:
    #     if annex_uuid is None:
    #         # most probably not yet 'known' and might require some annex

    copied_data = False
    # skip right away if data transfer is not desired
    if transfer_data != 'none' and isinstance(ds.repo, AnnexRepo):
        # publishing of `remote` might depend on publishing other
        # remote(s) first, so they need to receive the data first:
        for d in publish_depends:
            lgr.info("Transferring data to configured publication dependency: '%s'" % d)
            # properly initialized remote annex -> publish data
            for r in _publish_data(
                    ds,
                    d,
                    paths,
                    annex_copy_options,
                    force,
                    transfer_data,
                    **kwargs):
                # note if we published any data, notify to sync annex branch below
                if r['status'] == 'ok' and r['action'] == 'publish' and \
                        r.get('type', None) == 'file':
                    copied_data = True
                yield r
        # and for the main target
        for r in _publish_data(
                ds,
                remote,
                paths,
                annex_copy_options,
                force,
                transfer_data,
                **kwargs):
            # note if we published any data, notify to sync annex branch below
            if r['status'] == 'ok' and r['action'] == 'publish' and \
                    r.get('type', None) == 'file':
                copied_data = True
            yield r

    #
    # publish dataset (git push)
    #
    if not diff and not copied_data:
        lgr.debug("No changes detected with respect to state of '%s'", remote)
        yield get_status_dict(ds=ds, status='notneeded', **kwargs)
    else:
        # publishing of `remote` might depend on publishing other
        # remote(s) first:
        for d in publish_depends:
            lgr.info("Publishing to configured dependency: '%s'" % d)
            # call this again to take care of the dependency first,
            # but keep the paths the same, as the goal is to publish those
            # to the primary remote, and not anything else to a dependency
            for r in _publish_dataset(
                    ds,
                    d,
                    # should get the same as the base dataset
                    refspec,
                    paths,
                    annex_copy_options,
                    force=force,
                    jobs=jobs,
                    transfer_data=transfer_data,
                    **kwargs):
                yield r

        if is_annex_repo and \
                ds.repo.is_special_annex_remote(remote):
            # There is nothing else to "publish"
            lgr.debug(
                "{0} is a special annex remote, no git push is needed".format(remote)
            )
            return

        lgr.info("Publishing {0} to {1}".format(ds, remote))
        # in order to be able to use git's config to determine what to push,
        # we need to annex merge first. Otherwise a git push might be
        # rejected if involving all matching branches for example
        # even if we already fetched above we need to do it again
        if is_annex_repo:
            lgr.debug("Obtain remote annex info from '%s'", remote)
            ds.repo.fetch(remote=remote)
            ds.repo.merge_annex(remote)

        # Note: git's push.default is 'matching', which doesn't work for first
        # time publication (a branch that doesn't exist on the remote yet)
        # But if we want to respect remote.*.push entries, etc. we need to
        # not pass a specific refspec (like active branch) to `git push`
        # by default.
        # hence we amend any existing config on the fly
        # TODO: what else to push by default?
        # consider also: --follow-tags, --tags, --atomic
        # make sure we push
        things2push = []
        current_branch = ds.repo.get_active_branch()
        if current_branch:  # possibly make this conditional on a switch
            # TODO: this should become its own helper
            if is_annex_repo:
                # annex could manage this branch
                if current_branch.startswith('annex/direct') \
                        and ds.config.getbool('annex', 'direct', default=False):
                    # this is a "fake" annex direct mode branch
                    # we want to publish the underlying branch
                    current_branch = current_branch[12:]
                match_adjusted = re.match(
                    r'adjusted/(.*)\([a-z]*\)',
                    current_branch)
                if match_adjusted:
                    # adjusted/master(...)
                    # TODO:  this code is not tested
                    # see https://codecov.io/gh/datalad/datalad/src/17e67045a088ae0372b38aa4d8d46ecf7c821cb7/datalad/distribution/publish.py#L156
                    # and thus probably broken -- test me!
                    current_branch = match_adjusted.group(1)
            things2push.append(current_branch)
        if is_annex_repo:
            things2push.append('git-annex')
        # check that all our magic found valid branches
        things2push = [t for t in things2push if t in ds.repo.get_branches()]
        # check that we don't ask to push things that are already configured
        # -> would cause error
        # TODO need to find a way to properly do this, when wildcards are used
        # in the push configuration variable
        things2push = [t for t in things2push
                       if t not in ds.config.get('remote.{}.push'.format(remote), [])]
        # now we know what to push where
        status, msg = _push(ds, remote, things2push, force)
        yield get_status_dict(ds=ds, status=status, message=msg, **kwargs)
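
The final push planning reduces to: take the active branch (plus 'git-annex' for annex repos), keep only branches that actually exist, and drop those already covered by remote.<name>.push. A hedged sketch of that filtering with plain git (the remote name and path are made up; wildcard push refspecs are not handled, as the TODO above notes):

import subprocess

def plan_push(repo_path, remote, is_annex):
    def git(*args):
        return subprocess.run(['git'] + list(args), cwd=repo_path,
                              capture_output=True, text=True)

    candidates = []
    head = git('symbolic-ref', '--short', '-q', 'HEAD').stdout.strip()
    if head:
        candidates.append(head)             # active branch, unless detached
    if is_annex:
        candidates.append('git-annex')      # annex metadata branch

    existing = {line.strip().lstrip('* ') for line in
                git('branch', '--list').stdout.splitlines()}
    configured = set(
        git('config', '--get-all', 'remote.{}.push'.format(remote))
        .stdout.split())

    return [b for b in candidates if b in existing and b not in configured]

# hypothetical usage
print(plan_push('/tmp/demo', 'origin', is_annex=True))
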