Example #1
def test_known_failure_v6():

    @known_failure_v6
    def failing():
        raise AssertionError("Failed")

    from datalad import cfg

    v6 = cfg.obtain("datalad.repo.version") == 6
    skip = cfg.obtain("datalad.tests.knownfailures.skip")
    probe = cfg.obtain("datalad.tests.knownfailures.probe")

    if v6:
        if skip:
            # skipping takes precedence over probing
            failing()
        elif probe:
            # if we probe a known failure it's okay to fail:
            failing()
        else:
            # not skipping and not probing results in the original failure:
            assert_raises(AssertionError, failing)

    else:
        # behaves as if it wasn't decorated at all, no matter what
        assert_raises(AssertionError, failing)
Example #2
def test_probe_known_failure():

    # Note: we can't test the switch "datalad.tests.knownfailures.probe"
    # directly, since it was evaluated in the decorator already. So we need
    # to have different assertions in this test based on config and have it
    # tested across builds, which use different settings for that switch.

    @probe_known_failure
    def not_failing():
        pass

    @probe_known_failure
    def failing():
        raise AssertionError("Failed")

    from datalad import cfg
    switch = cfg.obtain("datalad.tests.knownfailures.probe")

    if switch:
        # if probing is enabled the failing is considered to be expected and
        # therefore the decorated function doesn't actually fail:
        failing()
        # in opposition a function that doesn't fail raises an AssertionError:
        assert_raises(AssertionError, not_failing)
    else:
        # if probing is disabled it should just fail/pass as is:
        assert_raises(AssertionError, failing)
        not_failing()
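
The two tests above drive their expectations entirely off the datalad.tests.knownfailures.* config switches. For orientation, a minimal sketch of what a probe-style decorator could look like follows; this is an illustration written for this page under that assumption, not DataLad's actual implementation:

from functools import wraps

def probe_known_failure_sketch(func):
    # consult the same switch the tests above query via cfg.obtain()
    from datalad import cfg
    if not cfg.obtain("datalad.tests.knownfailures.probe"):
        return func  # probing disabled: leave the test untouched

    @wraps(func)
    def _probing(*args, **kwargs):
        try:
            func(*args, **kwargs)
        except Exception:
            return  # the known failure happened, which is what we expected
        raise AssertionError(
            "%s was expected to fail, but it did not" % func.__name__)
    return _probing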
Example #3
def get_oracle_db(
        dbserver=None,
        port=1521,
        sid='ORCL',
        credential=None):
    dbserver = dbserver or cfg.obtain('datalad.externals.nda.dbserver',
                                      default=DEFAULT_SERVER)
    # This specific username has access to the 'Image' selection of NDA as of about today
    #username = username \
    #           or cfg.get('externals:nda', 'username',
    #                default='halchenkoy_103924')
    if not credential:
        providers = Providers.from_config_files()
        credential = providers.get_provider(DEFAULT_SERVER).credential

    if not isinstance(credential, dict):
        credential = credential()

    import cx_Oracle   # you must have the beast if you want to access the dark side
    dsnStr = cx_Oracle.makedsn(dbserver, port, sid)
    db = cx_Oracle.connect(user=credential['user'],
                           password=credential['password'],
                           dsn=dsnStr)

    return db
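
A hedged usage sketch for the helper above; the server name is made up and a working NDA provider/credential configuration is assumed:

db = get_oracle_db(dbserver='nda.example.org')   # hypothetical host
cur = db.cursor()
cur.execute("SELECT 1 FROM dual")                # trivial connectivity check
print(cur.fetchone())
db.close()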
Example #4
def test_subdatasets(path):
    # from scratch
    ds = Dataset(path)
    assert_false(ds.is_installed())
    eq_(ds.subdatasets(), [])
    ds = ds.create()
    assert_true(ds.is_installed())
    eq_(ds.subdatasets(), [])
    # create some file and commit it
    open(os.path.join(ds.path, 'test'), 'w').write('some')
    ds.add(path='test')
    assert_true(ds.is_installed())
    ds.save("Hello!", version_tag=1)
    # Assuming that tmp location was not under a super-dataset
    eq_(ds.get_superdataset(), None)
    eq_(ds.get_superdataset(topmost=True), ds)

    # add itself as a subdataset (crazy, isn't it?)
    subds = ds.install('subds', source=path,
        result_xfm='datasets', return_type='item-or-list')
    assert_true(subds.is_installed())
    eq_(subds.get_superdataset(), ds)
    eq_(subds.get_superdataset(topmost=True), ds)

    subdss = ds.subdatasets()
    eq_(len(subdss), 1)
    eq_(subds.path, ds.subdatasets(result_xfm='paths')[0])
    eq_(subdss, ds.subdatasets(recursive=True))
    eq_(subdss, ds.subdatasets(fulfilled=True))
    ds.save("with subds", version_tag=2)
    ds.recall_state(1)
    assert_true(ds.is_installed())
    eq_(ds.subdatasets(), [])

    # very nested subdataset to test topmost
    subsubds = subds.install(
        _path_('d1/subds'), source=path,
        result_xfm='datasets', return_type='item-or-list')
    assert_true(subsubds.is_installed())
    eq_(subsubds.get_superdataset(), subds)
    # by default, it will only report a superdataset that actually
    # has the queried dataset as a registered true subdataset
    eq_(subsubds.get_superdataset(topmost=True), subds)
    # but we can also ask for a dataset that is merely above
    eq_(subsubds.get_superdataset(topmost=True, registered_only=False), ds)

    # verify that '^' alias would work
    with chpwd(subsubds.path):
        dstop = Dataset('^')
        eq_(dstop, subds)
        # and while in the dataset we still can resolve into central one
        dscentral = Dataset('///')
        eq_(dscentral.path,
            cfg.obtain('datalad.locations.default-dataset'))

    with chpwd(ds.path):
        dstop = Dataset('^')
        eq_(dstop, ds)
Example #5
def test_known_failure():

    @known_failure
    def failing():
        raise AssertionError("Failed")

    from datalad import cfg

    skip = cfg.obtain("datalad.tests.knownfailures.skip")
    probe = cfg.obtain("datalad.tests.knownfailures.probe")

    if skip:
        # skipping takes precedence over probing
        failing()
    elif probe:
        # if we probe a known failure it's okay to fail:
        failing()
    else:
        # not skipping and not probing results in the original failure:
        assert_raises(AssertionError, failing)
Example #6
def get_url_cache_filename(url, name=None):
    """Return a filename where to cache online doc from a url"""
    if not name:
        name = "misc"
    cache_dir = opj(cfg.obtain('datalad.locations.cache'), name)
    doc_fname = opj(
        cache_dir,
        '{}-{}.p{}'.format(
            urlsplit(url).netloc,
            md5(url.encode('utf-8')).hexdigest(),
            pickle.HIGHEST_PROTOCOL)
    )
    return doc_fname
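
The pieces of the cache file name can be reproduced with the standard library alone; a small standalone illustration (URL and resulting digest are just examples):

import pickle
from hashlib import md5
from urllib.parse import urlsplit

url = "https://example.com/docs/index.html"
print("{}-{}.p{}".format(
    urlsplit(url).netloc,
    md5(url.encode('utf-8')).hexdigest(),
    pickle.HIGHEST_PROTOCOL))
# prints something like 'example.com-<32 hex chars>.p5'; the real file then
# lives under <datalad.locations.cache>/<name>/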
Example #7
    def _flyweight_id_from_args(cls, *args, **kwargs):

        if args:
            # to a certain degree we need to simulate an actual call to __init__
            # and make sure the passed arguments fit:
            # TODO: Figure out whether there is a cleaner way to do this in a
            # generic fashion
            assert('path' not in kwargs)
            path = args[0]
            args = args[1:]
        elif 'path' in kwargs:
            path = kwargs.pop('path')
        else:
            raise TypeError("__init__() requires argument `path`")

        if path is None:
            raise AttributeError

        # mirror what is happening in __init__
        if isinstance(path, ut.PurePath):
            path = text_type(path)

        # Custom handling for a few special abbreviations
        path_ = path
        if path == '^':
            # get the topmost dataset from current location. Note that 'zsh'
            # might have its ideas on what to do with ^, so better use as -d^
            path_ = Dataset(curdir).get_superdataset(topmost=True).path
        elif path == '///':
            # TODO: logic/UI on installing a default dataset could move here
            # from search?
            path_ = cfg.obtain('datalad.locations.default-dataset')
        if path != path_:
            lgr.debug("Resolved dataset alias %r to path %r", path, path_)

        # Sanity check for argument `path`:
        # raise if we cannot deal with `path` at all or
        # if it is not a local thing:
        path_ = RI(path_).localpath

        # we want an absolute path, but no resolved symlinks
        if not isabs(path_):
            path_ = opj(getpwd(), path_)

        # use canonical paths only:
        path_ = normpath(path_)
        kwargs['path'] = path_
        return path_, args, kwargs
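
The tail of the method above only normalizes the path; the same steps can be rerun in isolation. A minimal sketch, substituting os.getcwd() for DataLad's getpwd() helper:

from os import getcwd
from os.path import isabs, join as opj, normpath

path_ = 'some/sub/../dataset'      # hypothetical relative input
if not isabs(path_):
    path_ = opj(getcwd(), path_)
print(normpath(path_))             # absolute and canonical, symlinks untouched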
Example #8
 def _flyweight_preproc_path(cls, path):
     """Custom handling for few special abbreviations for datasets"""
     path_ = path
     if path == '^':
         # get the topmost dataset from current location. Note that 'zsh'
         # might have its ideas on what to do with ^, so better use as -d^
         path_ = Dataset(get_dataset_root(curdir)).get_superdataset(
             topmost=True).path
     elif path == '^.':
         # get the dataset containing current directory
         path_ = get_dataset_root(curdir)
     elif path == '///':
         # TODO: logic/UI on installing a default dataset could move here
         # from search?
         path_ = cfg.obtain('datalad.locations.default-dataset')
     if path != path_:
         lgr.debug("Resolved dataset alias %r to path %r", path, path_)
     return path_
Example #9
 def __call__(self):
     """Obtain credentials from a keyring and if any is not known -- ask"""
     fields = {}
     # check if we shall ask for credentials, even if some are on record
     # already (but maybe they were found to need updating)
     force_reentry = dlcfg.obtain(
         'datalad.credentials.force-ask',
         valtype=anything2bool)
     for f in self._FIELDS:
         # don't query for value if we need to get a new one
         v = None if force_reentry else self._get_field_value(f)
         if not self._is_field_optional(f):
             while v is None:  # was not known
                 v = self._ask_and_set(f)
             fields[f] = v
         elif v is not None:
             fields[f] = v
     return fields
Example #10
    def _flyweight_id_from_args(cls, *args, **kwargs):

        if args:
            # to a certain degree we need to simulate an actual call to __init__
            # and make sure the passed arguments fit:
            # TODO: Figure out whether there is a cleaner way to do this in a
            # generic fashion
            assert ('path' not in kwargs)
            path = args[0]
            args = args[1:]
        elif 'path' in kwargs:
            path = kwargs.pop('path')
        else:
            raise TypeError("__init__() requires argument `path`")

        if path is None:
            raise AttributeError

        # Custom handling for a few special abbreviations
        path_ = path
        if path == '^':
            # get the topmost dataset from current location. Note that 'zsh'
            # might have its ideas on what to do with ^, so better use as -d^
            path_ = Dataset(curdir).get_superdataset(topmost=True).path
        elif path == '///':
            # TODO: logic/UI on installing a default dataset could move here
            # from search?
            path_ = cfg.obtain('datalad.locations.default-dataset')
        if path != path_:
            lgr.debug("Resolved dataset alias %r to path %r", path, path_)

        # Sanity check for argument `path`:
        # raise if we cannot deal with `path` at all or
        # if it is not a local thing:
        path_ = RI(path_).localpath

        # we want an absolute path, but no resolved symlinks
        if not isabs(path_):
            path_ = opj(getpwd(), path_)

        # use canonical paths only:
        path_ = normpath(path_)
        kwargs['path'] = path_
        return path_, args, kwargs
Example #11
    def format_oneline_tb(self, limit=None, include_str=True):
        """Format an exception traceback as a one-line summary

        Returns a string of the form [filename:contextname:linenumber, ...].
        If include_str is True (default), this is prepended with the string
        representation of the exception.
        """

        # Note: No import at module level, since ConfigManager imports
        # dochelpers -> circular import when creating datalad.cfg instance at
        # startup.
        from datalad import cfg

        if include_str:
            # try exc message
            leading = str(self.tb)
            if not leading:
                # go with type
                leading = self.tb.exc_type.__qualname__
            out = "{} ".format(leading)
        else:
            out = ""

        if limit is None:
            # TODO: config logging.exceptions.traceback_levels = 1
            #       ^ This is taken from exc_str(). What exactly does it mean?
            #         Controlling the tblimit differently for logging, result
            #         reporting, whatever else?
            limit = int(cfg.obtain('datalad.exc.str.tblimit', default=1))

        entries = []
        entries.extend(self.tb.stack)
        if self.tb.__cause__:
            entries.extend(self.tb.__cause__.stack)
        elif self.tb.__context__ and not self.tb.__suppress_context__:
            entries.extend(self.tb.__context__.stack)

        if entries:
            tb_str = "[%s]" % (','.join("{}:{}:{}".format(
                Path(frame_summary.filename).name, frame_summary.name,
                frame_summary.lineno) for frame_summary in entries[-limit:]))
            out += "{}".format(tb_str)

        return out
Example #12
 def _get_result_filter(cls, args):
     from datalad import cfg
     result_filter = None
     if args.common_report_status or 'datalad.runtime.report-status' in cfg:
         report_status = args.common_report_status or \
                         cfg.obtain('datalad.runtime.report-status')
         if report_status == "all":
             pass  # no filter
         elif report_status == 'success':
             result_filter = EnsureKeyChoice('status', ('ok', 'notneeded'))
         elif report_status == 'failure':
             result_filter = EnsureKeyChoice('status',
                                             ('impossible', 'error'))
         else:
             result_filter = EnsureKeyChoice('status', (report_status,))
     if args.common_report_type:
         tfilt = EnsureKeyChoice('type', tuple(args.common_report_type))
         result_filter = result_filter & tfilt if result_filter else tfilt
     return result_filter
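
A hedged sketch of how the assembled filter acts on individual result records, assuming that the EnsureKeyChoice constraint from datalad.support.constraints raises on non-matching values:

from datalad.support.constraints import EnsureKeyChoice

only_failures = EnsureKeyChoice('status', ('impossible', 'error'))
only_failures({'status': 'error', 'path': '/tmp/x'})   # passes the record through
# only_failures({'status': 'ok'})                      # would raise a ValueError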
Example #13
def test_search_outside1(tdir, newhome):
    with chpwd(tdir):
        # should fail since directory exists, but not a dataset
        # should not even waste our response ;)
        always_render = cfg.obtain('datalad.api.alwaysrender')
        with patch.object(search_mod, 'LOCAL_CENTRAL_PATH', newhome):
            if always_render:
                # we do try to render results which actually causes exception
                # to come right away
                assert_raises(NoDatasetArgumentFound, search, "bu")
            else:
                gen = search("bu")
                assert_is_generator(gen)
                assert_raises(NoDatasetArgumentFound, next, gen)

        # and if we point to some non-existing dataset -- the same in both cases
        # but might come before even next if always_render
        with assert_raises(ValueError):
            next(search("bu", dataset=newhome))
Example #14
 def _get_result_filter(cls, args):
     from datalad import cfg
     result_filter = None
     if args.common_report_status or 'datalad.runtime.report-status' in cfg:
         report_status = args.common_report_status or \
                         cfg.obtain('datalad.runtime.report-status')
         if report_status == "all":
             pass  # no filter
         elif report_status == 'success':
             result_filter = EnsureKeyChoice('status', ('ok', 'notneeded'))
         elif report_status == 'failure':
             result_filter = EnsureKeyChoice('status',
                                             ('impossible', 'error'))
         else:
             result_filter = EnsureKeyChoice('status', (report_status, ))
     if args.common_report_type:
         tfilt = EnsureKeyChoice('type', tuple(args.common_report_type))
         result_filter = result_filter & tfilt if result_filter else tfilt
     return result_filter
Example #15
def known_failure_direct_mode(func):
    """Test decorator marking a test as known to fail in a direct mode test run

    If datalad.repo.direct is set to True behaves like `known_failure`.
    Otherwise the original (undecorated) function is returned.
    """

    from datalad import cfg

    direct = cfg.obtain("datalad.repo.direct") or on_windows
    if direct:

        @known_failure
        @wraps(func)
        def dm_func(*args, **kwargs):
            return func(*args, **kwargs)

        return dm_func

    return func
Example #16
def known_failure_v6(func):
    """Test decorator marking a test as known to fail in a v6 test run

    If datalad.repo.version is set to 6 behaves like `known_failure`. Otherwise
    the original (undecorated) function is returned.
    """

    from datalad import cfg

    version = cfg.obtain("datalad.repo.version")
    if version and version == 6:

        @known_failure
        @wraps(func)
        def v6_func(*args, **kwargs):
            return func(*args, **kwargs)

        return v6_func

    return func
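
Applying the decorator is then a one-liner on any test expected to break with v6 repositories; note that, as the test comments elsewhere on this page point out, the config switch is evaluated when the decorator is applied, not when the test runs. The test name below is hypothetical:

@known_failure_v6
def test_something_that_breaks_on_v6_repos():
    ...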
Example #17
def test_skip_known_failure():

    # Note: we can't test the switch "datalad.tests.knownfailures.skip"
    # directly, since it was evaluated in the decorator already. So we need
    # to have different assertions in this test based on config and have it
    # tested across builds, which use different settings for that switch.

    @skip_known_failure
    def failing():
        raise AssertionError("Failed")

    switch = dl_cfg.obtain("datalad.tests.knownfailures.skip")

    if switch:
        # if skipping is enabled, we shouldn't see the exception:
        failing()
    else:
        # if it's disabled, failing() is executed and therefore exception
        # is raised:
        assert_raises(AssertionError, failing)
Example #18
def check_datasets_datalad_org(suffix, tdir):
    # Test that git-annex / datalad `install` and `get` work correctly against
    # our datasets.datalad.org.  Apparently things can break, especially with
    # the introduction of the smart HTTP backend for apache2 etc.
    ds = install(tdir, source='///dicoms/dartmouth-phantoms/bids_test6-PD+T2w' + suffix)
    eq_(ds.config.get('remote.origin.annex-ignore', None), None)
    # assert_result_count and not just assert_status since for some reason on
    # Windows we get two records due to a duplicate attempt (as res[1]) to get it
    # again, which is reported as "notneeded".  For the purpose of this test
    # it doesn't make a difference.
    # git-annex version is not "real" - but that is about when fix was introduced
    from datalad import cfg
    if on_windows \
        and cfg.obtain("datalad.repo.version") < 6 \
        and external_versions['cmd:annex'] <= '7.20181203':
        raise SkipTest("Known to fail, needs fixed git-annex")
    assert_result_count(
        ds.get(op.join('001-anat-scout_ses-{date}', '000001.dcm')),
        1,
        status='ok')
    assert_status('ok', ds.remove())
Example #19
def check_datasets_datalad_org(suffix, tdir):
    # Test that git-annex / datalad `install` and `get` work correctly against
    # our datasets.datalad.org.  Apparently things can break, especially with
    # the introduction of the smart HTTP backend for apache2 etc.
    ds = install(tdir, source='///dicoms/dartmouth-phantoms/bids_test6-PD+T2w' + suffix)
    eq_(ds.config.get('remote.origin.annex-ignore', None), None)
    # assert_result_count and not just assert_status since for some reason on
    # Windows we get two records due to a duplicate attempt (as res[1]) to get it
    # again, which is reported as "notneeded".  For the purpose of this test
    # it doesn't make a difference.
    # git-annex version is not "real" - but that is about when fix was introduced
    from datalad import cfg
    if on_windows \
        and cfg.obtain("datalad.repo.version") < 6 \
        and external_versions['cmd:annex'] <= '7.20181203':
        raise SkipTest("Known to fail, needs fixed git-annex")
    assert_result_count(
        ds.get(op.join('001-anat-scout_ses-{date}', '000001.dcm')),
        1,
        status='ok')
    assert_status('ok', ds.remove())
Example #20
def test_skip_known_failure():

    # Note: we can't test the switch "datalad.tests.knownfailures.skip"
    # directly, since it was evaluated in the decorator already. So we need
    # to have different assertions in this test based on config and have it
    # tested across builds, which use different settings for that switch.

    @skip_known_failure
    def failing():
        raise AssertionError("Failed")

    from datalad import cfg
    switch = cfg.obtain("datalad.tests.knownfailures.skip")

    if switch:
        # if skipping is enabled, we shouldn't see the exception:
        failing()
    else:
        # if it's disabled, failing() is executed and therefore exception
        # is raised:
        assert_raises(AssertionError, failing)
Example #21
def _get_ssh_version(exe=None):
    """Return version of ssh

    Annex prior to 20170302 used a bundled version; then across all systems we
    used the system one if installed, and later switched to the one defined in
    configuration, with the system-wide one (not the default in PATH, e.g. from
    conda) "forced" on Windows.  If no specific executable is provided via
    `exe`, we use the one from configuration.
    """
    if exe is None:
        from datalad import cfg
        exe = cfg.obtain("datalad.ssh.executable")
    out = _runner.run([exe, '-V'], protocol=StdOutErrCapture)
    # apparently spits out to err but I wouldn't trust it blindly
    stdout = out['stdout']
    if out['stderr'].startswith('OpenSSH'):
        stdout = out['stderr']
    assert stdout.startswith(
        'OpenSSH')  # that is the only one we care about atm
    # The version is the last '_'-separated item of the first word, which may be
    # separated from the rest by ',' or followed by another word after a space
    return stdout.split(',', 1)[0].split(' ')[0].rstrip('.').split('_')[-1]
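
The final expression can be checked against a typical OpenSSH banner; the sample string below is for illustration only:

banner = "OpenSSH_8.2p1 Ubuntu-4ubuntu0.5, OpenSSL 1.1.1f  31 Mar 2020"
print(banner.split(',', 1)[0].split(' ')[0].rstrip('.').split('_')[-1])
# -> '8.2p1'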
Example #22
    def __call__(self, instructions=None):
        """Obtain credentials from a keyring and if any is not known -- ask

        Parameters
        ----------
        instructions : str, optional
          If given, the auto-generated instructions based on a login-URL are
          replaced by the given string
        """
        fields = {}
        # check if we shall ask for credentials, even if some are on record
        # already (but maybe they were found to need updating)
        force_reentry = dlcfg.obtain('datalad.credentials.force-ask',
                                     valtype=anything2bool)
        for f in self._FIELDS:
            # don't query for value if we need to get a new one
            v = None if force_reentry else self._get_field_value(f)
            if not self._is_field_optional(f):
                while v is None:  # was not known
                    v = self._ask_and_set(f, instructions=instructions)
                fields[f] = v
            elif v is not None:
                fields[f] = v
        return fields
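
For context, such a __call__ lives on a credential class; a hedged usage sketch assuming the UserPassword class from datalad.downloaders.credentials (the service name is made up):

from datalad.downloaders.credentials import UserPassword

cred = UserPassword('my-hypothetical-service')
fields = cred()   # {'user': ..., 'password': ...}; prompts for anything not in the keyring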
Example #23
def _process_results(results, cmd_class, on_failure, action_summary,
                     incomplete_results, result_renderer, result_log_level,
                     allkwargs):
    # private helper pf @eval_results
    # loop over results generated from some source and handle each
    # of them according to the requested behavior (logging, rendering, ...)

    # used to track repeated messages in the default renderer
    last_result = None
    last_result_ts = None
    # which result dict keys to inspect for changes to discover repetitions
    # of similar messages
    repetition_keys = set(('action', 'status', 'type', 'refds'))
    # counter for detected repetitions
    result_repetitions = 0
    # how many repetitions to show, before suppression kicks in
    render_n_repetitions = \
        dlcfg.obtain('datalad.ui.suppress-similar-results-threshold') \
        if sys.stdout.isatty() \
        and dlcfg.obtain('datalad.ui.suppress-similar-results') \
        else float("inf")

    for res in results:
        if not res or 'action' not in res:
            # XXX Yarik has no clue on how to track the origin of the
            # record to figure out WTF, so he just skips it
            # but MIH thinks leaving a trace of that would be good
            lgr.debug('Drop result record without "action": %s', res)
            continue

        actsum = action_summary.get(res['action'], {})
        if res['status']:
            actsum[res['status']] = actsum.get(res['status'], 0) + 1
            action_summary[res['action']] = actsum
        ## log message, if there is one and a logger was given
        msg = res.get('message', None)
        # remove logger instance from results, as it is no longer useful
        # after logging was done, it isn't serializable, and generally
        # pollutes the output
        res_lgr = res.pop('logger', None)
        if msg and res_lgr:
            if isinstance(res_lgr, logging.Logger):
                # didn't get a particular log function, go with default
                res_lgr = getattr(
                    res_lgr, default_logchannels[res['status']]
                    if result_log_level is None else result_log_level)
            msg = res['message']
            msgargs = None
            if isinstance(msg, tuple):
                msgargs = msg[1:]
                msg = msg[0]
            if 'path' in res:
                # result path could be a path instance
                path = str(res['path'])
                if msgargs:
                    # we will pass the msg for %-polation, so % should be doubled
                    path = path.replace('%', '%%')
                msg = '{} [{}({})]'.format(msg, res['action'], path)
            if msgargs:
                # support string expansion of logging to avoid runtime cost
                try:
                    res_lgr(msg, *msgargs)
                except TypeError as exc:
                    raise TypeError("Failed to render %r with %r from %r: %s" %
                                    (msg, msgargs, res, exc_str(exc)))
            else:
                res_lgr(msg)

        ## output rendering
        # TODO RF this in a simple callable that gets passed into this function
        if result_renderer is None or result_renderer == 'disabled':
            pass
        elif result_renderer == 'default':
            trimmed_result = {
                k: v
                for k, v in res.items() if k in repetition_keys
            }
            if res.get('status', None) != 'notneeded' \
                    and trimmed_result == last_result:
                # this is a similar report, suppress if too many, but count it
                result_repetitions += 1
                if result_repetitions < render_n_repetitions:
                    default_result_renderer(res)
                else:
                    last_result_ts = _display_suppressed_message(
                        result_repetitions, render_n_repetitions,
                        last_result_ts)
            else:
                # this one is new, first report on any prev. suppressed results
                # by number, and then render this fresh one
                last_result_ts = _display_suppressed_message(
                    result_repetitions,
                    render_n_repetitions,
                    last_result_ts,
                    final=True)
                default_result_renderer(res)
                result_repetitions = 0
            last_result = trimmed_result
        elif result_renderer in ('json', 'json_pp'):
            ui.message(
                json.dumps(
                    {k: v
                     for k, v in res.items() if k not in ('logger',)},
                    sort_keys=True,
                    indent=2 if result_renderer.endswith('_pp') else None,
                    default=str))
        elif result_renderer in ('tailored', 'default'):
            if hasattr(cmd_class, 'custom_result_renderer'):
                cmd_class.custom_result_renderer(res, **allkwargs)
        elif hasattr(result_renderer, '__call__'):
            try:
                result_renderer(res, **allkwargs)
            except Exception as e:
                lgr.warning('Result rendering failed for: %s [%s]', res,
                            exc_str(e))
        else:
            raise ValueError(
                'unknown result renderer "{}"'.format(result_renderer))

        ## error handling
        # looks for error status, and report at the end via
        # an exception
        if on_failure in ('continue', 'stop') \
                and res['status'] in ('impossible', 'error'):
            incomplete_results.append(res)
            if on_failure == 'stop':
                # first fail -> that's it
                # raise will happen after the loop
                break
        yield res
    # make sure to report on any issues that we had suppressed
    _display_suppressed_message(result_repetitions,
                                render_n_repetitions,
                                last_result_ts,
                                final=True)
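
The suppression of similar results above keys on a trimmed copy of each record; a self-contained sketch of just that comparison, with a made-up threshold and fake records:

repetition_keys = {'action', 'status', 'type', 'refds'}
render_n_repetitions = 2          # hypothetical threshold
last, repetitions = None, 0
fake_results = [{'action': 'get', 'status': 'ok', 'path': p}
                for p in ('a', 'b', 'c', 'd')]
for res in fake_results:
    trimmed = {k: v for k, v in res.items() if k in repetition_keys}
    if trimmed == last:
        repetitions += 1
        if repetitions >= render_n_repetitions:
            continue              # would be suppressed (only counted)
    else:
        repetitions = 0
    print(res)                    # would be rendered
    last = trimmed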
Example #24
def _generate_func_api():
    """Auto detect all available interfaces and generate a function-based
       API from them
    """
    from importlib import import_module
    from inspect import isgenerator
    from collections import namedtuple
    from collections import OrderedDict
    from functools import wraps

    from datalad import cfg

    from .interface.base import update_docstring_with_parameters
    from .interface.base import get_interface_groups
    from .interface.base import get_api_name
    from .interface.base import alter_interface_docs_for_api

    def _kwargs_to_namespace(call, args, kwargs):
        """
        Given a __call__, args and kwargs passed, prepare a cmdlineargs-like
        thing
        """
        from inspect import getargspec
        argspec = getargspec(call)
        defaults = argspec.defaults
        nargs = len(argspec.args)
        assert (nargs >= len(defaults))
        # map any args to their name
        argmap = list(zip(argspec.args[:len(args)], args))
        # map defaults of kwargs to their names (update below)
        argmap += list(zip(argspec.args[-len(defaults):], defaults))
        kwargs_ = OrderedDict(argmap)
        # update with provided kwarg args
        kwargs_.update(kwargs)
        assert (nargs == len(kwargs_))
        # Get all arguments removing those possible ones used internally and
        # which shouldn't be exposed outside anyways
        [kwargs_.pop(k) for k in list(kwargs_) if k.startswith('_')]
        namespace = namedtuple("smth", kwargs_.keys())(**kwargs_)
        return namespace

    def call_gen(call, renderer):
        """Helper to generate a call_ for call, to use provided renderer"""

        @wraps(call)
        def call_(*args, **kwargs):
            ret1 = ret = call(*args, **kwargs)
            if isgenerator(ret):
                # At first I thought we might just rerun it for output
                # at the end, but that wouldn't work if the command actually
                # has a side-effect, i.e. actually does something, so we need
                # to memoize all generated output and output it instead
                from datalad.utils import saved_generator
                ret, ret1 = saved_generator(ret)

            renderer(ret, _kwargs_to_namespace(call, args, kwargs))
            return ret1

        # TODO: see if we could proxy the "signature" of function
        # call from the original one
        call_.__doc__ += \
            "\nNote\n----\n\n" \
            "This version of a function uses cmdline results renderer before " \
            "returning the result"
        return call_

    always_render = cfg.obtain('datalad.api.alwaysrender')
    for grp_name, grp_descr, interfaces in get_interface_groups():
        for intfspec in interfaces:
            # turn the interface spec into an instance
            mod = import_module(intfspec[0], package='datalad')
            intf = getattr(mod, intfspec[1])
            spec = getattr(intf, '_params_', dict())

            # FIXME no longer using an interface class instance
            # convert the parameter SPEC into a docstring for the function
            update_docstring_with_parameters(
                intf.__call__, spec,
                prefix=alter_interface_docs_for_api(
                    intf.__doc__),
                suffix=alter_interface_docs_for_api(
                    intf.__call__.__doc__)
            )
            globals()[get_api_name(intfspec)] = intf.__call__
            # And the one with '_' suffix which would use cmdline results
            # renderer
            if hasattr(intf, 'result_renderer_cmdline'):
                intf__ = call_gen(intf.__call__, intf.result_renderer_cmdline)
                globals()[get_api_name(intfspec) + '_'] = intf__
                if always_render:
                    globals()[get_api_name(intfspec)] = intf__
Example #25
from itertools import chain
from functools import lru_cache

from errno import EACCES
from os.path import realpath
from threading import Lock

from functools import wraps

import fsspec
from fuse import FUSE, FuseOSError, Operations, LoggingMixIn

from datalad.support.annexrepo import AnnexRepo
from datalad import cfg

CACHE_DIR = op.join(cfg.obtain('datalad.locations.cache'), 'fuse')

if op.lexists(CACHE_DIR):
    raise RuntimeError(
        f"Please first remove {CACHE_DIR}.  We are yet to figure out how to"
        f" ensure correctly working persistent cache:"
        f" https://github.com/intake/filesystem_spec/issues/553")

# explicit blockcache instance for better control etc
import fsspec.implementations.cached
fs_block = fsspec.implementations.cached.CachingFileSystem(
        fs=fsspec.filesystem('http'), # , target_protocol='blockcache'),
        #target_protocol='blockcache',
        cache_storage=CACHE_DIR,
        #cache_check=600,
        #block_size=1024,
Example #26
from datalad.core.distributed.clone import Clone
from datalad.distribution.dataset import Dataset
from datalad.support.annexrepo import AnnexRepo
from datalad.tests.utils_pytest import (
    DEFAULT_REMOTE,
    with_tempfile,
)
from datalad.utils import (
    Path,
    better_wraps,
    ensure_list,
    optional_args,
    rmtree,
)

DATALAD_TESTS_CACHE = cfg.obtain("datalad.tests.cache")


def url2filename(url):
    """generate file/directory name from a URL"""

    # TODO: Not really important for now, but there should be a more
    #       sophisticated approach to replace. May be just everything that
    #       isn't alphanumeric? Or simply hash the URL?
    #       URL: Will include version eventually. Would need parsing to hash
    #       w/o any parameters. Having separate clones per requested version
    #       would defy point of cache, particularly wrt downloading content.
    #       Depends on usecase, of course, but immediate one is about container
    #       images -> not cheap.
    # make it a Path, too, so pathlib can raise if we are creating an invalid
    # path on some system we run the tests on.
Example #27
    def __call__(path=None,
                 initopts=None,
                 force=False,
                 description=None,
                 dataset=None,
                 no_annex=False,
                 fake_dates=False,
                 cfg_proc=None):
        refds_path = dataset.path if hasattr(dataset, 'path') else dataset

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if path:
            path = rev_resolve_path(path, dataset)

        path = path if path \
            else getpwd() if dataset is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # prep for yield
        res = dict(action='create',
                   path=text_type(path),
                   logger=lgr,
                   type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != path:
            refds = require_dataset(refds_path,
                                    check_installed=True,
                                    purpose='creating a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", dataset,
                        text_type(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = rev_get_dataset_root(
            op.normpath(op.join(text_type(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = ut.Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = ut.Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if any(check_path == p or check_path in p.parents
                   for p in pstatus):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents
                ]
                res.update({
                    'status':
                    'error',
                    'message':
                    ('collision with content in parent dataset at %s: %s',
                     text_type(parentds_path),
                     [text_type(c) for c in conflict])
                })
                yield res
                return
            # another set of checks to see whether the target path is pointing
            # into a known subdataset that is not around ATM
            subds_status = {
                parentds_path / k.relative_to(prepo.path)
                for k, v in iteritems(pstatus)
                if v.get('type', None) == 'dataset'
            }
            check_paths = [check_path]
            check_paths.extend(check_path.parents)
            if any(p in subds_status for p in check_paths):
                conflict = [p for p in check_paths if p in subds_status]
                res.update({
                    'status':
                    'error',
                    'message':
                    ('collision with %s (dataset) in dataset %s',
                     text_type(conflict[0]), text_type(parentds_path))
                })
                yield res
                return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and \
            dataset.path == path else Dataset(text_type(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status':
                'error',
                'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'
            })
            yield res
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = {}

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # create and configure desired repository
        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            tbrepo = GitRepo(tbds.path,
                             url=None,
                             create=True,
                             create_sanity_checks=False,
                             git_opts=initopts,
                             fake_dates=fake_dates)
            # place a .noannex file to indicate annex to leave this repo alone
            stamp_path = ut.Path(tbrepo.path) / '.noannex'
            stamp_path.touch()
            add_to_git[stamp_path] = {'type': 'file', 'state': 'untracked'}
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                # do not set backend here, to avoid a dedicated commit
                backend=None,
                # None causes version to be taken from config
                version=None,
                description=description,
                git_opts=initopts,
                fake_dates=fake_dates)
            # set the annex backend in .gitattributes as a staged change
            tbrepo.set_default_backend(cfg.obtain('datalad.repo.backend'),
                                       persistent=True,
                                       commit=False)
            add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'added'
            }
            # make sure that v6 annex repos never commit content under .datalad
            attrs_cfg = (('config', 'annex.largefiles', 'nothing'), (
                'metadata/aggregate*', 'annex.largefiles', 'nothing'
            ), ('metadata/objects/**', 'annex.largefiles', '({})'.format(
                cfg.obtain('datalad.metadata.create-aggregate-annex-limit'))))
            attrs = tbds.repo.get_gitattributes(
                [op.join('.datalad', i[0]) for i in attrs_cfg])
            set_attrs = []
            for p, k, v in attrs_cfg:
                if not attrs.get(op.join('.datalad', p), {}).get(k, None) == v:
                    set_attrs.append((p, {k: v}))
            if set_attrs:
                tbds.repo.set_gitattributes(set_attrs,
                                            attrfile=op.join(
                                                '.datalad', '.gitattributes'))

            # prevent git annex from ever annexing .git* stuff (gh-1597)
            attrs = tbds.repo.get_gitattributes('.git')
            if not attrs.get('.git', {}).get('annex.largefiles',
                                             None) == 'nothing':
                tbds.repo.set_gitattributes([('**/.git*', {
                    'annex.largefiles': 'nothing'
                })])
                # must use the repo.pathobj as this will have resolved symlinks
                add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                    'type': 'file',
                    'state': 'untracked'
                }

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note that the Dataset property `id` will change when we unset the
        # respective config. Therefore store it before:
        tbds_id = tbds.id
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds.config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds.config.add(id_var,
                        tbds_id if tbds_id is not None else uuid_id,
                        where='dataset',
                        reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in iteritems(tbds.config.overrides):
            tbds.config.add(k, v, where='local', reload=False)

        # all config manipulation is done -> full reload
        tbds.config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbds.repo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'
        }

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.repo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(refds, Dataset) and refds.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in refds.save(path=tbds.path, ):
                yield r

        res.update({'status': 'ok'})
        yield res

        for cfg_proc_ in cfg_proc or []:
            for r in tbds.run_procedure('cfg_' + cfg_proc_):
                yield r
Example #28
    unlink,
    rmdir,
    rmtemp,
    rmtree,
    get_tempfile_kwargs,
    on_windows,
    Path,
)
from datalad import cfg
from datalad.config import anything2bool

# fall back on patool, if a functional implementation is available
# (i.e. not on windows), it is requested, or 7z is not found
if not on_windows and (
        cfg.obtain(
            'datalad.runtime.use-patool', default=False,
            valtype=anything2bool) or not external_versions['cmd:7z']):
    from datalad.support.archive_utils_patool import (
        decompress_file as _decompress_file,
        # other code expects this to be here
        compress_files
    )
else:
    from datalad.support.archive_utils_7z import (
        decompress_file as _decompress_file,
        # other code expects this to be here
        compress_files
    )

lgr = logging.getLogger('datalad.support.archives')
Example #29
def get_runner(*args, **kwargs):
    if cfg.obtain('datalad.crawl.dryrun', default=False):
        kwargs = kwargs.copy()
        kwargs['protocol'] = DryRunProtocol()
    return Runner(*args, **kwargs)
Example #30
def test_invalid_call(path):
    with chpwd(path):
        # ^ Change directory so that we don't fail with an
        # InvalidGitRepositoryError if the test is executed from a git
        # worktree.

        # needs spec or discover
        assert_raises(InsufficientArgumentsError, run_procedure)
        res = run_procedure('unknown', on_failure='ignore')
        assert_true(len(res) == 1)
        assert_in_results(res, status="impossible")


# FIXME: For some reason fails to commit correctly if on windows and in direct
# mode. However, direct mode on linux works
@skip_if(cond=on_windows and cfg.obtain("datalad.repo.version") < 6)
@known_failure_direct_mode  #FIXME
@with_tree(
    tree={
        'code': {
            'datalad_test_proc.py':
            """\
import sys
import os.path as op
from datalad.api import add, Dataset

with open(op.join(sys.argv[1], 'fromproc.txt'), 'w') as f:
    f.write('hello\\n')
add(dataset=Dataset(sys.argv[1]), path='fromproc.txt')
"""
        }
Example #31
def _get_search_index(index_dir, ds, force_reindex):
    from whoosh import index as widx
    from .metadata import agginfo_relpath
    # what is the latest state of aggregated metadata
    metadata_state = ds.repo.get_last_commit_hash(agginfo_relpath)
    stamp_fname = opj(index_dir, 'datalad_metadata_state')
    definitions_fname = opj(index_dir, 'datalad_term_definitions.json.gz')

    if not force_reindex and \
            exists(stamp_fname) and \
            open(stamp_fname).read() == metadata_state:
        try:
            # TODO check that the index schema is the same
            # as the one we would have used for reindexing
            # TODO support incremental re-indexing, whoosh can do it
            idx = widx.open_dir(index_dir)
            lgr.debug('Search index contains %i documents', idx.doc_count())
            return idx
        except widx.LockError as e:
            raise e
        except widx.IndexError as e:
            # Generic index error.
            # we try to regenerate
            # TODO log this
            pass
        except widx.IndexVersionError as e:  # (msg, version, release=None)
            # Raised when you try to open an index using a format that the
            # current version of Whoosh cannot read. That is, when the index
            # you're trying to open is either not backward or forward
            # compatible with this version of Whoosh.
            # we try to regenerate
            # TODO log this
            pass
        except widx.OutOfDateError as e:
            # Raised when you try to commit changes to an index which is not
            # the latest generation.
            # this should not happen here, but if it does ... KABOOM
            raise e
        except widx.EmptyIndexError as e:
            # Raised when you try to work with an index that has no indexed
            # terms.
            # we can just continue with generating an index
            pass

    lgr.info('{} search index'.format(
        'Rebuilding' if exists(index_dir) else 'Building'))

    if not exists(index_dir):
        os.makedirs(index_dir)

    schema, definitions, per_ds_defs = _get_search_schema(ds)

    idx_obj = widx.create_in(index_dir, schema)
    idx = idx_obj.writer(
        # cache size per process
        limitmb=cfg.obtain('datalad.search.indexercachesize'),
        # disable parallel indexing for now till #1927 is resolved
        ## number of processes for indexing
        #procs=multiprocessing.cpu_count(),
        ## write separate index segments in each process for speed
        ## asks for writer.commit(optimize=True)
        #multisegment=True,
    )

    # load metadata of the base dataset and what it knows about all its subdatasets
    # (recursively)
    old_idx_size = 0
    old_ds_rpath = ''
    idx_size = 0
    for res in _query_aggregated_metadata(
            reporton=ds.config.obtain(
                'datalad.metadata.searchindex-documenttype'),
            ds=ds,
            aps=[dict(path=ds.path, type='dataset')],
            # TODO expose? but this would likely only affect metadata in the
            # base dataset
            merge_mode='init',
            # MIH: I cannot see a case when we would not want recursion (within
            # the metadata)
            recursive=True):
        rpath = relpath(res['path'], start=ds.path)
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        rtype = res['type']
        meta = res.get('metadata', {})
        meta = MetadataDict(meta)
        if rtype == 'dataset':
            if old_ds_rpath:
                lgr.info(
                    'Added %s on dataset %s',
                    single_or_plural('document',
                                     'documents',
                                     idx_size - old_idx_size,
                                     include_count=True), old_ds_rpath)
            old_idx_size = idx_size
            old_ds_rpath = rpath

            # get any custom dataset mappings
            ds_defs = per_ds_defs.get(res['path'], {})
            # now we merge all reported unique content properties (flattened representation
            # of content metadata) with the main metadata set, using the 'add' strategy.
            # This way any existing metadata value of a dataset itself will be amended by
            # those coming from the content. E.g. a single dataset 'license' might be turned
            # into a sequence of unique license identifiers across all dataset components
            meta.merge_add(meta.get('unique_content_properties', {}))
            meta.pop('unique_content_properties', None)
        doc_props = dict(path=rpath,
                         type=rtype,
                         **_meta2index_dict(meta, definitions, ds_defs))
        if 'parentds' in res:
            doc_props['parentds'] = relpath(res['parentds'], start=ds.path)
        _add_document(idx, **doc_props)
        idx_size += 1

    if old_ds_rpath:
        lgr.info(
            'Added %s on dataset %s',
            single_or_plural('document',
                             'documents',
                             idx_size - old_idx_size,
                             include_count=True), old_ds_rpath)

    idx.commit(optimize=True)

    # "timestamp" the search index to allow for automatic invalidation
    with open(stamp_fname, 'w') as f:
        f.write(metadata_state)

    # dump the term/field definitions records for later introspection
    # use compressed storage, there is no point in inflating the
    # disk space requirements
    with gzopen(definitions_fname, 'wb') as f:
        # TODO actually go through all, incl. compound, definitions ('@id' plus 'unit'
        # or similar) and resolve terms to URLs, if anyhow possible
        jsondump2file(definitions, f)

    lgr.info('Search index contains %i documents', idx_size)
    return idx_obj
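
Once built, such an index can be queried with Whoosh directly; a hedged sketch where the index directory is a placeholder and the queried field depends on the schema generated above:

from whoosh import index as widx
from whoosh.qparser import QueryParser

index_dir = '/path/to/index_dir'   # placeholder: wherever the index was created
idx = widx.open_dir(index_dir)
with idx.searcher() as searcher:
    query = QueryParser("path", idx.schema).parse("anat")
    for hit in searcher.search(query, limit=5):
        print(hit.fields())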
Example #32
    ensure_bytes,
    ensure_unicode,
    unlink,
    rmdir,
    rmtemp,
    rmtree,
    get_tempfile_kwargs,
    on_windows,
    Path,
)
from datalad import cfg
from datalad.config import anything2bool

# fall back on patool, if a functional implementation is available
# (i.e. not on windows), it is requested, or 7z is not found
if not on_windows and (cfg.obtain(
        'datalad.runtime.use-patool', default=False, valtype=anything2bool)
                       or not external_versions['cmd:7z']):
    from datalad.support.archive_utils_patool import (
        decompress_file as _decompress_file,
        # other code expects this to be here
        compress_files)
else:
    from datalad.support.archive_utils_7z import (
        decompress_file as _decompress_file,
        # other code expects this to be here
        compress_files)

lgr = logging.getLogger('datalad.support.archives')


def decompress_file(archive, dir_, leading_directories='strip'):
Example #33
def test_subdatasets(path):
    # from scratch
    ds = Dataset(path)
    assert_false(ds.is_installed())
    assert_raises(ValueError, ds.subdatasets)
    ds = ds.create()
    assert_true(ds.is_installed())
    eq_(ds.subdatasets(), [])
    # create some file and commit it
    open(os.path.join(ds.path, 'test'), 'w').write('some')
    ds.save(path='test', message="Hello!", version_tag=1)
    assert_true(ds.is_installed())
    # Assuming that tmp location was not under a super-dataset
    eq_(ds.get_superdataset(), None)
    eq_(ds.get_superdataset(topmost=True), ds)

    # add itself as a subdataset (crazy, isn't it?)
    subds = ds.install('subds',
                       source=path,
                       result_xfm='datasets',
                       return_type='item-or-list')
    assert_true(subds.is_installed())
    eq_(subds.get_superdataset(), ds)
    eq_(subds.get_superdataset(topmost=True), ds)

    subdss = ds.subdatasets()
    eq_(len(subdss), 1)
    eq_(subds.path, ds.subdatasets(result_xfm='paths')[0])
    eq_(subdss, ds.subdatasets(recursive=True))
    eq_(subdss, ds.subdatasets(fulfilled=True))
    ds.save(message="with subds", version_tag=2)
    ds.recall_state(1)
    assert_true(ds.is_installed())
    eq_(ds.subdatasets(), [])

    # very nested subdataset to test topmost
    subsubds = subds.install(_path_('d1/subds'),
                             source=path,
                             result_xfm='datasets',
                             return_type='item-or-list')
    assert_true(subsubds.is_installed())
    eq_(subsubds.get_superdataset(), subds)
    # by default, it will only report a superdataset that actually
    # has the queried dataset as a registered true subdataset
    eq_(subsubds.get_superdataset(topmost=True), subds)
    # but we can also ask for a dataset that is merely above
    eq_(subsubds.get_superdataset(topmost=True, registered_only=False), ds)

    # verify that '^' alias would work
    with chpwd(subsubds.path):
        dstop = Dataset('^')
        eq_(dstop, subds)
        # and while in the dataset we can still resolve the central one
        dscentral = Dataset('///')
        eq_(dscentral.path, dl_cfg.obtain('datalad.locations.default-dataset'))

    with chpwd(ds.path):
        dstop = Dataset('^')
        eq_(dstop, ds)

    # TODO actual submodule checkout is still there

    # Test ^. (the dataset for curdir) shortcut
    # At the top should point to the top
    with chpwd(ds.path):
        dstop = Dataset('^.')
        eq_(dstop, ds)

    # and still does within subdir
    os.mkdir(opj(ds.path, 'subdir'))
    with chpwd(opj(ds.path, 'subdir')):
        dstop = Dataset('^.')
        eq_(dstop, ds)

    # within submodule will point to submodule
    with chpwd(subsubds.path):
        dstop = Dataset('^.')
        eq_(dstop, subsubds)
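
# Hedged sketch (assumption, not part of the test above): the path aliases
# exercised here also work outside of tests; the import location and helper
# name are illustrative assumptions.
def _show_dataset_aliases():
    from datalad.api import Dataset
    top = Dataset('^')        # topmost registered superdataset containing the CWD
    here = Dataset('^.')      # root of the dataset containing the CWD
    central = Dataset('///')  # the configured default ("central") dataset location
    return top, here, central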
Beispiel #34
0
    def __call__(repo_name, repo_accession, repo_url, path=None, output=None, dataset=None):
        # we need this resource file, no point in starting without it
        itmpl_path = cfg.obtain(
            'datalad.plugin.bids2scidata.investigator.template',
            default=opj(
                dirname(datalad_neuroimaging.__file__),
                'resources', 'isatab', 'scidata_bids_investigator.txt'))

        if path and dataset is None:
            dataset = path
        dataset = require_dataset(
            dataset, purpose='metadata query', check_installed=True)

        errored = False
        dsmeta = None
        filemeta = []
        for m in metadata(
                path,
                dataset=dataset,
                # BIDS hierarchy might go across multiple datasets
                recursive=True,
                reporton='all',
                return_type='generator',
                result_renderer='disabled'):
            type = m.get('type', None)
            if type not in ('dataset', 'file'):
                continue
            if m.get('status', None) != 'ok':
                errored = errored or m.get('status', None) in ('error', 'impossible')
                yield m
                continue
            if type == 'dataset':
                if dsmeta is not None:
                    lgr.warn(
                        'Found metadata for more than one dataset, '
                        'ignoring their dataset-level metadata')
                    continue
                dsmeta = m
            elif type == 'file':
                filemeta.append(m)
        if errored:
            return

        if not dsmeta or 'refcommit' not in dsmeta:
            yield dict(
                status='error',
                message=("could not find aggregated metadata on path '%s'", path),
                path=dataset.path,
                type='dataset',
                action='bids2scidata',
                logger=lgr)
            return

        lgr.info("Metadata for %i files associated with '%s' on record in %s",
                 len(filemeta),
                 path,
                 dataset)

        if not output:
            output = 'scidata_isatab_{}'.format(dsmeta['refcommit'])

        info = convert(
            dsmeta,
            filemeta,
            output_directory=output,
            repository_info={
                'Comment[Data Repository]': repo_name,
                'Comment[Data Record Accession]': repo_accession,
                'Comment[Data Record URI]': repo_url},
        )
        if info is None:
            yield dict(
                status='error',
                message='dataset does not seem to contain relevant metadata',
                path=dataset.path,
                type='dataset',
                action='bids2scidata',
                logger=lgr)
            return

        with open(itmpl_path, encoding='utf-8') as tf:
            itmpl = tf.read()
        with open(opj(output, 'i_Investigation.txt'), 'w', encoding='utf-8') as ifile:
            ifile.write(
                itmpl.format(
                    datalad_version=datalad.__version__,
                    date=datetime.now().strftime('%Y/%m/%d'),
                    repo_name=repo_name,
                    repo_accession=repo_accession,
                    repo_url=repo_url,
                    **info
                ))
        yield dict(
            status='ok',
            path=abspath(output),
            # TODO add switch to make tarball/ZIP
            #type='file',
            type='directory',
            action='bids2scidata',
            logger=lgr)
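
# Hedged sketch (assumption): the investigator template looked up at the top of
# __call__ can be redirected to a custom ISA-Tab file through the same
# configuration item; the helper name is hypothetical, and config.add is the
# mechanism used elsewhere in these examples.
def _set_custom_investigator_template(ds, template_path):
    ds.config.add('datalad.plugin.bids2scidata.investigator.template',
                  template_path, where='dataset')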
Beispiel #35
0
def _get_procedure_implementation(name='*', ds=None):
    """get potential procedure path and configuration

    The order of consideration is user-level, system-level, dataset,
    datalad extensions, datalad. The first match found in this order is the
    one to be returned. Therefore local definitions/configurations take
    precedence over ones that come from outside (via a datalad extension or a
    dataset with its .datalad/config). If a dataset had precedence (as it did
    before), adding (or merely updating) a (sub-)dataset could surprisingly
    cause you to execute code different from what you defined within
    ~/.gitconfig or your local repository's .git/config.
    So, local definitions take precedence over remote ones, and more specific
    ones over more general ones.

    Returns
    -------
    tuple
      path, name, format string, help message
    """

    ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None

    # 1. check system and user account for procedure
    for loc in (cfg.obtain('datalad.locations.user-procedures'),
                cfg.obtain('datalad.locations.system-procedures')):
        for dir in assure_list(loc):
            for m, n in _get_file_match(dir, name):
                yield (m, n,) + _get_proc_config(n)
    # 2. check dataset for procedure
    if ds is not None and ds.is_installed():
        # could be more than one
        dirs = assure_list(
                ds.config.obtain('datalad.locations.dataset-procedures'))
        for dir in dirs:
            # TODO `get` dirs if necessary
            for m, n in _get_file_match(op.join(ds.path, dir), name):
                yield (m, n,) + _get_proc_config(n, ds=ds)
        # 2.1. check subdatasets recursively
        for subds in ds.subdatasets(return_type='generator',
                                    result_xfm='datasets'):
            for m, n, f, h in _get_procedure_implementation(name=name, ds=subds):
                yield m, n, f, h

    # 3. check extensions for procedure
    # delay heavy import until here
    from pkg_resources import iter_entry_points
    from pkg_resources import resource_isdir
    from pkg_resources import resource_filename
    for entry_point in iter_entry_points('datalad.extensions'):
        # use of '/' here is OK with respect to platform compatibility
        if resource_isdir(entry_point.module_name, 'resources/procedures'):
            for m, n in _get_file_match(
                    resource_filename(
                        entry_point.module_name,
                        'resources/procedures'),
                    name):
                yield (m, n,) + _get_proc_config(n)
    # 4. at last check datalad itself for procedure
    for m, n in _get_file_match(
            resource_filename('datalad', 'resources/procedures'),
            name):
        yield (m, n,) + _get_proc_config(n)
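
# Hedged sketch (assumption, not from the original module): a caller would take
# the first yielded match, since locations are probed in order of precedence;
# the helper name is hypothetical.
def _resolve_procedure(name, ds=None):
    for path, procname, fmt, helpmsg in _get_procedure_implementation(name, ds=ds):
        # first (most local) match wins
        return path, procname, fmt, helpmsg
    raise ValueError("no procedure matching %r found" % (name,))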
Beispiel #36
0
    def __call__(path=None,
                 force=False,
                 description=None,
                 dataset=None,
                 no_annex=False,
                 save=True,
                 annex_version=None,
                 annex_backend='MD5E',
                 native_metadata_type=None,
                 shared_access=None,
                 git_opts=None,
                 annex_opts=None,
                 annex_init_opts=None,
                 text_no_annex=None):

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD
        if path and dataset:
            # Given a path and a dataset (path) that does not point to an
            # installed dataset
            if not dataset.is_installed():
                msg = "No installed dataset at %s found." % dataset.path
                dsroot = get_dataset_root(dataset.path)
                if dsroot:
                    msg += " If you meant to add to the %s dataset, use that path " \
                           "instead but remember that if dataset is provided, " \
                           "relative paths are relative to the top of the " \
                           "dataset." % dsroot
                raise ValueError(msg)

        # sanity check first
        if git_opts:
            lgr.warning(
                "`git_opts` argument is presently ignored, please complain!")
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")
            if annex_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex and declaring no "
                                 "annex repo.")
            if annex_init_opts:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "options for annex init and declaring no "
                                 "annex repo.")

        if not isinstance(force, bool):
            raise ValueError(
                "force should be bool, got %r.  Did you mean to provide a 'path'?"
                % force)
        annotated_paths = AnnotatePaths.__call__(
            # nothing given explicitly, assume create fresh right here
            path=path if path else getpwd() if dataset is None else None,
            dataset=dataset,
            recursive=False,
            action='create',
            # we need to know whether we have to check for potential
            # subdataset collision
            force_parentds_discovery=True,
            # it is absolutely OK to have something that does not exist
            unavailable_path_status='',
            unavailable_path_msg=None,
            # if we have a dataset given that actually exists, we want to
            # fail if the requested path is not in it
            nondataset_path_status='error' \
                if isinstance(dataset, Dataset) and dataset.is_installed() else '',
            on_failure='ignore')
        path = None
        for r in annotated_paths:
            if r['status']:
                # this is dealt with already
                yield r
                continue
            if path is not None:
                raise ValueError(
                    "`create` can only handle single target path or dataset")
            path = r

        if len(annotated_paths) and path is None:
            # we got something, we complained already, done
            return

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # prep for yield
        path.update({'logger': lgr, 'type': 'dataset'})
        # just discard, we have a new story to tell
        path.pop('message', None)
        if 'parentds' in path:
            subs = Subdatasets.__call__(
                dataset=path['parentds'],
                # any known
                fulfilled=None,
                recursive=False,
                contains=path['path'],
                result_xfm='relpaths')
            if len(subs):
                path.update({
                    'status':
                    'error',
                    'message':
                    ('collision with known subdataset %s/ in dataset %s',
                     subs[0], path['parentds'])
                })
                yield path
                return

        # TODO here we need a further test that if force=True, we need to look if
        # there is a superdataset (regardless of whether we want to create a
        # subdataset or not), and if that superdataset tracks anything within
        # this directory -- if so, we need to stop right here and whine, because
        # the result of creating a repo here will produce an undesired mess

        if git_opts is None:
            git_opts = {}
        if shared_access:
            # configure `git --shared` value
            git_opts['shared'] = shared_access

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and dataset.path == path['path'] \
            else Dataset(path['path'])

        # don't create in non-empty directory without `force`:
        if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            path.update({
                'status':
                'error',
                'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'
            })
            yield path
            return

        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            GitRepo(tbds.path, url=None, create=True, git_opts=git_opts)
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(tbds.path,
                               url=None,
                               create=True,
                               backend=annex_backend,
                               version=annex_version,
                               description=description,
                               git_opts=git_opts,
                               annex_opts=annex_opts,
                               annex_init_opts=annex_init_opts)

            if text_no_annex:
                git_attributes_file = opj(tbds.path, '.gitattributes')
                with open(git_attributes_file, 'a') as f:
                    f.write('* annex.largefiles=(not(mimetype=text/*))\n')
                tbrepo.add([git_attributes_file], git=True)
                tbrepo.commit("Instructed annex to add text files to git",
                              _datalad_msg=True,
                              files=[git_attributes_file])

        if native_metadata_type is not None:
            if not isinstance(native_metadata_type, list):
                native_metadata_type = [native_metadata_type]
            for nt in native_metadata_type:
                tbds.config.add('datalad.metadata.nativetype', nt)

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a re-create
            tbds.config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds.config.add(id_var,
                        tbds.id if tbds.id is not None else uuid_id,
                        where='dataset')

        # make sure that v6 annex repos never commit content under .datalad
        with open(opj(tbds.path, '.datalad', '.gitattributes'),
                  'a') as gitattr:
            # TODO this will need adjusting, when annex'ed aggregate metadata
            # comes around
            gitattr.write(
                '# Text files (according to file --mime-type) are added directly to git.\n'
            )
            gitattr.write(
                '# See http://git-annex.branchable.com/tips/largefiles/ for more info.\n'
            )
            gitattr.write('** annex.largefiles=nothing\n')
            gitattr.write('metadata/objects/** annex.largefiles=({})\n'.format(
                cfg.obtain('datalad.metadata.create-aggregate-annex-limit')))

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.add('.datalad',
                 to_git=True,
                 save=save,
                 message='[DATALAD] new dataset')

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(dataset, Dataset) and dataset.path != tbds.path \
           and tbds.repo.get_hexsha():
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.add(tbds.path,
                                 save=save,
                                 return_type='generator',
                                 result_filter=None,
                                 result_xfm=None,
                                 on_failure='ignore'):
                yield r

        path.update({'status': 'ok'})
        yield path
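
# Hedged usage sketch (assumption, not from the original module): the
# function-style API generated from this __call__ would be invoked roughly as
# below; whether datalad.api exposes these exact parameters is an assumption
# based on the signature above.
def _demo_create_text_in_git(path):
    import datalad.api as dl
    return dl.create(path=path, text_no_annex=True,
                     native_metadata_type='bids')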
Beispiel #37
0
def _setup_annex_repo(path, initopts=None, fake_dates=False, description=None):
    """Create and configure a repository at `path`

    This includes a default setup of annex.largefiles.

    Parameters
    ----------
    path: str or Path
      Path of the repository
    initopts: dict, optional
      Git options to be passed to the AnnexRepo constructor
    fake_dates: bool, optional
      Passed to the AnnexRepo constructor
    description: str, optional
      Passed to the AnnexRepo constructor

    Returns
    -------
    AnnexRepo, dict
      The created repository and records for any repo components that need to
      be passed to git-add as a result of the setup procedure.
    """
    # always come with annex when created from scratch
    tbrepo = AnnexRepo(
        path,
        create=True,
        create_sanity_checks=False,
        # do not set backend here, to avoid a dedicated commit
        backend=None,
        # None causes version to be taken from config
        version=None,
        description=description,
        git_opts=initopts,
        fake_dates=fake_dates)
    # set the annex backend in .gitattributes as a staged change
    tbrepo.set_default_backend(cfg.obtain('datalad.repo.backend'),
                               persistent=True,
                               commit=False)
    add_to_git = {
        tbrepo.pathobj / '.gitattributes': {
            'type': 'file',
            'state': 'added',
        }
    }
    # make sure that v6 annex repos never commit content under .datalad
    attrs_cfg = (('config', 'annex.largefiles', 'nothing'), (
        'metadata/aggregate*', 'annex.largefiles',
        'nothing'), ('metadata/objects/**', 'annex.largefiles', '({})'.format(
            cfg.obtain('datalad.metadata.create-aggregate-annex-limit'))))
    attrs = tbrepo.get_gitattributes(
        [op.join('.datalad', i[0]) for i in attrs_cfg])
    set_attrs = []
    for p, k, v in attrs_cfg:
        if not attrs.get(op.join('.datalad', p), {}).get(k, None) == v:
            set_attrs.append((p, {k: v}))
    if set_attrs:
        tbrepo.set_gitattributes(set_attrs,
                                 attrfile=op.join('.datalad',
                                                  '.gitattributes'))

    # prevent git annex from ever annexing .git* stuff (gh-1597)
    attrs = tbrepo.get_gitattributes('.git')
    if not attrs.get('.git', {}).get('annex.largefiles', None) == 'nothing':
        tbrepo.set_gitattributes([('**/.git*', {
            'annex.largefiles': 'nothing'
        })])
        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbrepo.pathobj / '.gitattributes'] = {
            'type': 'file',
            'state': 'untracked'
        }
    return tbrepo, add_to_git
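
# Hedged sketch (assumption, not from the original module): a caller would
# typically hand the returned records to the initial save, mirroring the
# create() implementation shown later in these examples.
def _demo_setup_and_save(path):
    repo, add_to_git = _setup_annex_repo(path, description='demo repo')
    repo.save(message='[DATALAD] new dataset', git=True, _status=add_to_git)
    return repo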
Beispiel #38
0
from datalad.api import run_procedure
from datalad.api import clean
from datalad import cfg


def test_invalid_call():
    # needs spec or discover
    assert_raises(InsufficientArgumentsError, run_procedure)
    res = run_procedure('unknown', on_failure='ignore')
    assert_true(len(res) == 1)
    assert_in_results(res, status="impossible")


# FIXME: For some reason this fails to commit correctly on Windows in direct
# mode. However, direct mode on Linux works
@skip_if(cond=on_windows and cfg.obtain("datalad.repo.version") < 6)
@known_failure_direct_mode  #FIXME
@with_tree(tree={
    'code': {'datalad_test_proc.py': """\
import sys
import os.path as op
from datalad.api import add, Dataset

with open(op.join(sys.argv[1], 'fromproc.txt'), 'w') as f:
    f.write('hello\\n')
add(dataset=Dataset(sys.argv[1]), path='fromproc.txt')
"""}})
@with_tempfile
def test_basics(path, super_path):
    ds = Dataset(path).create(force=True)
    ds.run_procedure('setup_yoda_dataset')
Beispiel #39
0
def _generate_func_api():
    """Auto detect all available interfaces and generate a function-based
       API from them
    """
    from importlib import import_module
    from inspect import isgenerator
    from collections import namedtuple
    from collections import OrderedDict
    from functools import wraps

    from datalad import cfg

    from .interface.base import update_docstring_with_parameters
    from .interface.base import get_interface_groups
    from .interface.base import get_api_name
    from .interface.base import alter_interface_docs_for_api

    def _kwargs_to_namespace(call, args, kwargs):
        """
        Given a __call__ and the args and kwargs it was passed, prepare a
        cmdline-args-like namespace
        """
        from inspect import getargspec
        argspec = getargspec(call)
        defaults = argspec.defaults
        nargs = len(argspec.args)
        assert (nargs >= len(defaults))
        # map any args to their name
        argmap = list(zip(argspec.args[:len(args)], args))
        # map defaults of kwargs to their names (update below)
        argmap += list(zip(argspec.args[-len(defaults):], defaults))
        kwargs_ = OrderedDict(argmap)
        # update with provided kwarg args
        kwargs_.update(kwargs)
        assert (nargs == len(kwargs_))
        # Remove any arguments that are only used internally and should not be
        # exposed outside anyway
        for k in [k for k in kwargs_ if k.startswith('_')]:
            kwargs_.pop(k)
        namespace = namedtuple("smth", kwargs_.keys())(**kwargs_)
        return namespace

    def call_gen(call, renderer):
        """Helper to generate a call_ for call, to use provided renderer"""
        @wraps(call)
        def call_(*args, **kwargs):
            ret1 = ret = call(*args, **kwargs)
            if isgenerator(ret):
                # At first I thought we might just rerun it for output at the
                # end, but that wouldn't work if the command has side effects,
                # i.e. actually does something. So we need to memoize all
                # generated output and emit it instead.
                from datalad.utils import saved_generator
                ret, ret1 = saved_generator(ret)

            renderer(ret, _kwargs_to_namespace(call, args, kwargs))
            return ret1

        # TODO: see if we could proxy the "signature" of function
        # call from the original one
        call_.__doc__ += \
            "\nNote\n----\n\n" \
            "This version of the function uses the cmdline results renderer " \
            "before returning the result"
        return call_

    always_render = cfg.obtain('datalad.api.alwaysrender')
    for grp_name, grp_descr, interfaces in get_interface_groups():
        for intfspec in interfaces:
            # turn the interface spec into an instance
            mod = import_module(intfspec[0], package='datalad')
            intf = getattr(mod, intfspec[1])
            spec = getattr(intf, '_params_', dict())

            # FIXME no longer using an interface class instance
            # convert the parameter SPEC into a docstring for the function
            update_docstring_with_parameters(
                intf.__call__,
                spec,
                prefix=alter_interface_docs_for_api(intf.__doc__),
                suffix=alter_interface_docs_for_api(intf.__call__.__doc__))
            globals()[get_api_name(intfspec)] = intf.__call__
            # And the one with '_' suffix which would use cmdline results
            # renderer
            if hasattr(intf, 'result_renderer_cmdline'):
                intf__ = call_gen(intf.__call__, intf.result_renderer_cmdline)
                globals()[get_api_name(intfspec) + '_'] = intf__
                if always_render:
                    globals()[get_api_name(intfspec)] = intf__
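
# Hedged sketch (assumption): once _generate_func_api() has populated the
# namespace, every interface is reachable as a plain function, and interfaces
# that define result_renderer_cmdline additionally gain a '_'-suffixed variant
# that renders results the way the command line would. The helper below is a
# hypothetical illustration.
def _demo_api_call(path):
    import datalad.api as api
    # any generated interface name would do; 'create' is just an example
    return api.create(path=path)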
Beispiel #40
0
def _get_procedure_implementation(name='*', ds=None):
    """get potential procedure path and configuration

    The order of consideration is user-level, system-level, dataset,
    datalad extensions, datalad. The first match found in this order is the
    one to be returned. Therefore local definitions/configurations take
    precedence over ones that come from outside (via a datalad extension or a
    dataset with its .datalad/config). If a dataset had precedence (as it did
    before), adding (or merely updating) a (sub-)dataset could surprisingly
    cause you to execute code different from what you defined within
    ~/.gitconfig or your local repository's .git/config.
    So, local definitions take precedence over remote ones, and more specific
    ones over more general ones.

    Returns
    -------
    tuple
      path, name, format string, help message
    """

    ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None

    # 1. check system and user account for procedure
    for loc in (cfg.obtain('datalad.locations.user-procedures'),
                cfg.obtain('datalad.locations.system-procedures')):
        for dir in assure_list(loc):
            for m, n in _get_file_match(dir, name):
                yield (m, n,) + _get_proc_config(n)
    # 2. check dataset for procedure
    if ds is not None and ds.is_installed():
        # could be more than one
        dirs = assure_list(
                ds.config.obtain('datalad.locations.dataset-procedures'))
        for dir in dirs:
            # TODO `get` dirs if necessary
            for m, n in _get_file_match(op.join(ds.path, dir), name):
                yield (m, n,) + _get_proc_config(n, ds=ds)
        # 2.1. check subdatasets recursively
        for subds in ds.subdatasets(return_type='generator',
                                    result_xfm='datasets'):
            for m, n, f, h in _get_procedure_implementation(name=name, ds=subds):
                yield m, n, f, h

    # 3. check extensions for procedure
    # delay heavy import until here
    from pkg_resources import iter_entry_points
    from pkg_resources import resource_isdir
    from pkg_resources import resource_filename
    for entry_point in iter_entry_points('datalad.extensions'):
        # use of '/' here is OK with respect to platform compatibility
        if resource_isdir(entry_point.module_name, 'resources/procedures'):
            for m, n in _get_file_match(
                    resource_filename(
                        entry_point.module_name,
                        'resources/procedures'),
                    name):
                yield (m, n,) + _get_proc_config(n)
    # 4. at last check datalad itself for procedure
    for m, n in _get_file_match(
            resource_filename('datalad', 'resources/procedures'),
            name):
        yield (m, n,) + _get_proc_config(n)
Beispiel #41
0
    def _mk_search_index(self, force_reindex):
        """Generic entrypoint to index generation

        The actual work that determines the structure and content of the index
        is done by functions that are passed in as arguments

        `meta2doc` - must return dict for index document from result input
        """
        from whoosh import index as widx
        from .metadata import agginfo_relpath
        # what is the latest state of aggregated metadata
        metadata_state = self.ds.repo.get_last_commit_hash(agginfo_relpath)
        # use a location common to all index types; they would all invalidate
        # simultaneously
        stamp_fname = opj(self.index_dir, 'datalad_metadata_state')
        index_dir = opj(self.index_dir, self._mode_label)

        if (not force_reindex) and \
                exists(index_dir) and \
                exists(stamp_fname) and \
                open(stamp_fname).read() == metadata_state:
            try:
                # TODO check that the index schema is the same
                # as the one we would have used for reindexing
                # TODO support incremental re-indexing, whoosh can do it
                idx = widx.open_dir(index_dir)
                lgr.debug('Search index contains %i documents',
                          idx.doc_count())
                self.idx_obj = idx
                return
            except widx.LockError as e:
                raise e
            except widx.IndexError as e:
                # Generic index error.
                # we try to regenerate
                lgr.warning(
                    "Cannot open existing index %s (%s), will regenerate",
                    index_dir, exc_str(e))
            except widx.IndexVersionError as e:  # (msg, version, release=None)
                # Raised when you try to open an index using a format that the
                # current version of Whoosh cannot read. That is, when the index
                # you're trying to open is either not backward or forward
                # compatible with this version of Whoosh.
                # we try to regenerate
                lgr.warning(exc_str(e))
                pass
            except widx.OutOfDateError as e:
                # Raised when you try to commit changes to an index which is not
                # the latest generation.
                # this should not happen here, but if it does ... KABOOM
                raise
            except widx.EmptyIndexError as e:
                # Raised when you try to work with an index that has no indexed
                # terms.
                # we can just continue with generating an index
                pass
            except ValueError as e:
                if 'unsupported pickle protocol' in str(e):
                    lgr.warning(
                        "Cannot open existing index %s (%s), will regenerate",
                        index_dir, exc_str(e))
                else:
                    raise

        lgr.info('{} search index'.format(
            'Rebuilding' if exists(index_dir) else 'Building'))

        if not exists(index_dir):
            os.makedirs(index_dir)

        # this is a pretty cheap call that just pulls this info from a file
        dsinfo = self.ds.metadata(get_aggregates=True,
                                  return_type='list',
                                  result_renderer='disabled')

        self._mk_schema(dsinfo)

        idx_obj = widx.create_in(index_dir, self.schema)
        idx = idx_obj.writer(
            # cache size per process
            limitmb=cfg.obtain('datalad.search.indexercachesize'),
            # disable parallel indexing for now till #1927 is resolved
            ## number of processes for indexing
            #procs=multiprocessing.cpu_count(),
            ## write separate index segments in each process for speed
            ## asks for writer.commit(optimize=True)
            #multisegment=True,
        )

        # load metadata of the base dataset and what it knows about all its subdatasets
        # (recursively)
        old_idx_size = 0
        old_ds_rpath = ''
        idx_size = 0
        log_progress(
            lgr.info,
            'autofieldidxbuild',
            'Start building search index',
            total=len(dsinfo),
            label='Building search index',
            unit=' Datasets',
        )
        for res in query_aggregated_metadata(
                reporton=self.documenttype,
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                # MIH: I cannot see a case when we would not want recursion (within
                # the metadata)
                recursive=True):
            # this assumes that files are reported after each dataset report,
            # and after a subsequent dataset report no files for the previous
            # dataset will be reported again
            meta = res.get('metadata', {})
            doc = self._meta2doc(meta)
            admin = {
                'type': res['type'],
                'path': relpath(res['path'], start=self.ds.path),
            }
            if 'parentds' in res:
                admin['parentds'] = relpath(res['parentds'],
                                            start=self.ds.path)
            if admin['type'] == 'dataset':
                if old_ds_rpath:
                    lgr.debug(
                        'Added %s on dataset %s',
                        single_or_plural('document',
                                         'documents',
                                         idx_size - old_idx_size,
                                         include_count=True), old_ds_rpath)
                log_progress(lgr.info,
                             'autofieldidxbuild',
                             'Indexed dataset at %s',
                             old_ds_rpath,
                             update=1,
                             increment=True)
                old_idx_size = idx_size
                old_ds_rpath = admin['path']
                admin['id'] = res.get('dsid', None)

            doc.update({k: assure_unicode(v) for k, v in admin.items()})
            lgr.debug("Adding document to search index: {}".format(doc))
            # inject into index
            idx.add_document(**doc)
            idx_size += 1

        if old_ds_rpath:
            lgr.debug(
                'Added %s on dataset %s',
                single_or_plural('document',
                                 'documents',
                                 idx_size - old_idx_size,
                                 include_count=True), old_ds_rpath)

        lgr.debug("Committing index")
        idx.commit(optimize=True)
        log_progress(lgr.info, 'autofieldidxbuild',
                     'Done building search index')

        # "timestamp" the search index to allow for automatic invalidation
        with open(stamp_fname, 'w') as f:
            f.write(metadata_state)

        lgr.info('Search index contains %i documents', idx_size)
        self.idx_obj = idx_obj
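
# Hedged sketch (assumption): the writer cache size above comes straight from
# configuration, so it can be inspected (and tuned via the user's git config)
# without touching this code; the helper name is hypothetical.
def _indexer_cache_mb():
    from datalad import cfg
    return cfg.obtain('datalad.search.indexercachesize')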
Beispiel #42
0
def format_oneline_tb(exc, tb=None, limit=None, include_str=True):
    """Format an exception traceback as a one-line summary

    Parameters
    ----------
    exc: Exception
    tb: TracebackException, optional
      If not given, it is generated from the given exception.
    limit: int, optional
      Traceback depth limit. If not given, the config setting
      'datalad.exc.str.tblimit' will be used, or all entries
      are reported.
    include_str: bool
      If set to True (default), the return value is prepended with a string
      representation of the exception.

    Returns
    -------
    str
      Of format [filename:contextname:linenumber, ...].
    """

    # Note: No import at module level, since ConfigManager imports
    # dochelpers -> circular import when creating datalad.cfg instance at
    # startup.
    from datalad import cfg

    if include_str:
        # prefer the exception message, else fall back on the exception type name
        leading = exc.message or exc.name
        out = "{} ".format(leading)
    else:
        out = ""

    if tb is None:
        tb = traceback.TracebackException.from_exception(
            exc,
            limit=limit,
            lookup_lines=True,
            capture_locals=False,
        )

    entries = []
    entries.extend(tb.stack)
    if tb.__cause__:
        entries.extend(tb.__cause__.stack)
    elif tb.__context__ and not tb.__suppress_context__:
        entries.extend(tb.__context__.stack)

    if limit is None:
        limit = int(cfg.obtain('datalad.exc.str.tblimit',
                               default=len(entries)))
    if entries:
        tb_str = "[%s]" % (','.join(
            "{}:{}:{}".format(
                Path(frame_summary.filename).name,
                frame_summary.name,
                frame_summary.lineno)
            for frame_summary in entries[-limit:])
        )
        out += "{}".format(tb_str)

    return out
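
# Hedged usage sketch (assumption): `exc` is expected to expose `.message` and
# `.name` when include_str is True (as used above); with include_str=False a
# plain exception works as well.
def _demo_oneline_tb():
    try:
        1 / 0
    except ZeroDivisionError as e:
        # yields something like "[demo.py:_demo_oneline_tb:5]"
        return format_oneline_tb(e, include_str=False)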
Beispiel #43
0
from datalad.utils import (
    any_re_search,
    ensure_bytes,
    ensure_unicode,
    unlink,
    rmtemp,
    rmtree,
    get_tempfile_kwargs,
    on_windows,
    Path,
)
from datalad import cfg
from datalad.config import anything2bool

# fall back on patool if requested, or if 7z is not found
if (cfg.obtain(
        'datalad.runtime.use-patool', default=False, valtype=anything2bool)
        or not external_versions['cmd:7z']):
    from datalad.support.archive_utils_patool import (
        decompress_file as _decompress_file,
        # other code expects this to be here
        compress_files)
else:
    from datalad.support.archive_utils_7z import (
        decompress_file as _decompress_file,
        # other code expects this to be here
        compress_files)

lgr = logging.getLogger('datalad.support.archives')


def decompress_file(archive, dir_, leading_directories='strip'):
Beispiel #44
0
    def __call__(
            path=None,
            initopts=None,
            force=False,
            description=None,
            dataset=None,
            no_annex=False,
            fake_dates=False,
            cfg_proc=None
    ):
        refds_path = dataset.path if hasattr(dataset, 'path') else dataset

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if path:
            path = rev_resolve_path(path, dataset)

        path = path if path \
            else getpwd() if dataset is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert(path is not None)

        # prep for yield
        res = dict(action='create', path=text_type(path),
                   logger=lgr, type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != path:
            refds = require_dataset(
                refds_path, check_installed=True,
                purpose='creating a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s",
                        dataset, text_type(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = rev_get_dataset_root(
            op.normpath(op.join(text_type(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = ut.Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = ut.Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if any(
                    check_path == p or check_path in p.parents
                    for p in pstatus):
                # redo the check in a slower fashion; it is already known to be
                # broken, so let's take our time to assemble a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with content in parent dataset at %s: %s',
                        text_type(parentds_path),
                        [text_type(c) for c in conflict])})
                yield res
                return
            # another set of checks to see whether the target path points
            # into a known subdataset that is not around ATM
            subds_status = {
                parentds_path / k.relative_to(prepo.path)
                for k, v in iteritems(pstatus)
                if v.get('type', None) == 'dataset'}
            check_paths = [check_path]
            check_paths.extend(check_path.parents)
            if any(p in subds_status for p in check_paths):
                conflict = [p for p in check_paths if p in subds_status]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with %s (dataset) in dataset %s',
                        text_type(conflict[0]),
                        text_type(parentds_path))})
                yield res
                return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and \
            dataset.path == path else Dataset(text_type(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status': 'error',
                'message':
                    'will not create a dataset in a non-empty directory, use '
                    '`force` option to ignore'})
            yield res
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = {}

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # create and configure desired repository
        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            tbrepo = GitRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                git_opts=initopts,
                fake_dates=fake_dates)
            # place a .noannex file to tell git-annex to leave this repo alone
            stamp_path = ut.Path(tbrepo.path) / '.noannex'
            stamp_path.touch()
            add_to_git[stamp_path] = {
                'type': 'file',
                'state': 'untracked'}
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                # do not set backend here, to avoid a dedicated commit
                backend=None,
                # None causes version to be taken from config
                version=None,
                description=description,
                git_opts=initopts,
                fake_dates=fake_dates
            )
            # set the annex backend in .gitattributes as a staged change
            tbrepo.set_default_backend(
                cfg.obtain('datalad.repo.backend'),
                persistent=True, commit=False)
            add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'added'}
            # make sure that v6 annex repos never commit content under .datalad
            attrs_cfg = (
                ('config', 'annex.largefiles', 'nothing'),
                ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
                ('metadata/objects/**', 'annex.largefiles',
                 '({})'.format(cfg.obtain(
                     'datalad.metadata.create-aggregate-annex-limit'))))
            attrs = tbds.repo.get_gitattributes(
                [op.join('.datalad', i[0]) for i in attrs_cfg])
            set_attrs = []
            for p, k, v in attrs_cfg:
                if not attrs.get(
                        op.join('.datalad', p), {}).get(k, None) == v:
                    set_attrs.append((p, {k: v}))
            if set_attrs:
                tbds.repo.set_gitattributes(
                    set_attrs,
                    attrfile=op.join('.datalad', '.gitattributes'))

            # prevent git annex from ever annexing .git* stuff (gh-1597)
            attrs = tbds.repo.get_gitattributes('.git')
            if not attrs.get('.git', {}).get(
                    'annex.largefiles', None) == 'nothing':
                tbds.repo.set_gitattributes([
                    ('**/.git*', {'annex.largefiles': 'nothing'})])
                # must use the repo.pathobj as this will have resolved symlinks
                add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                    'type': 'file',
                    'state': 'untracked'}

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note that the Dataset property `id` will change when we unset the
        # respective config. Therefore store it beforehand:
        tbds_id = tbds.id
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds.config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds.config.add(
            id_var,
            tbds_id if tbds_id is not None else uuid_id,
            where='dataset',
            reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in iteritems(tbds.config.overrides):
            tbds.config.add(k, v, where='local', reload=False)

        # all config manipulation is done -> full reload
        tbds.config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbds.repo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'}

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.repo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(dataset, Dataset) and dataset.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in dataset.save(
                    path=tbds.path,
            ):
                yield r

        res.update({'status': 'ok'})
        yield res

        for cfg_proc_ in cfg_proc or []:
            for r in tbds.run_procedure('cfg_' + cfg_proc_):
                yield r
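
# Hedged usage sketch (assumption): through the function-style API this
# implementation backs, the trailing cfg_proc loop corresponds to something
# like the helper below; 'text2git' names a procedure shipped with datalad and
# is invoked as 'cfg_text2git' by the loop above.
def _demo_create_with_procedure(path):
    import datalad.api as dl
    return dl.create(path=path, cfg_proc=['text2git'])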