Exemple #1
0
def _describe_datalad():
    from datalad.version import __version__, __full_version__

    return {
        'version': assure_unicode(__version__),
        'full_version': assure_unicode(__full_version__),
    }
Exemple #2
0
def yield_participant_info(bids):
    for bidsvars in bids.get_collections(level='dataset')[0].to_df().to_dict(
            orient='records'):
        props = dict(id=assure_unicode(bidsvars.pop('subject')))
        for p in bidsvars:
            # take away some ambiguity
            normk = assure_unicode(p).lower()
            hk = content_metakey_map.get(normk, normk)
            val = assure_unicode(bidsvars[p])
            if hk in ('sex', 'gender'):
                if hasattr(val, 'lower'):
                    val = val.lower()
                elif isinstance(val, float) and isnan(val):
                    # pybids reports 'n/a' is NaN
                    val = 'n/a'
                val = sex_label_map.get(val, val)
            if hk == 'suffix' and val == 'participants':
                # regression in PyBIDS 0.7.1, should be fixed in 0.8
                # https://github.com/bids-standard/pybids/issues/380
                # TODO: remove workaround whenever we depend on pybids >= 0.8
                #  after verifying that it is not succeptable
                continue
            if val:
                props[hk] = val
        if props:
            yield re.compile(r'^sub-{}/.*'.format(props['id'])), props
Exemple #3
0
def _add_document(idx, **kwargs):
    idx.add_document(
        **{
            assure_unicode(k):
            assure_unicode(v) if isinstance(v, unicode_srctypes) else v
            for k, v in kwargs.items()
        })
Exemple #4
0
    def __call__(self, query, max_nresults=None, force_reindex=False):
        with self.idx_obj.searcher() as searcher:
            wquery = self.get_query(query)

            # perform the actual search
            hits = searcher.search(
                wquery,
                terms=True,
                limit=max_nresults if max_nresults > 0 else None)
            # report query stats
            topstr = '{} top {}'.format(
                max_nresults,
                single_or_plural('match', 'matches', max_nresults)
            )
            lgr.info('Query completed in {} sec.{}'.format(
                hits.runtime,
                ' Reporting {}.'.format(
                    ('up to ' + topstr)
                    if max_nresults > 0
                    else 'all matches'
                )
                if not hits.is_empty()
                else ' No matches.'
            ))

            if not hits:
                return

            nhits = 0
            # annotate hits for full metadata report
            hits = [dict(
                path=normpath(opj(self.ds.path, hit['path'])),
                query_matched={assure_unicode(k): assure_unicode(v)
                               if isinstance(v, unicode_srctypes) else v
                               for k, v in hit.matched_terms()},
                parentds=normpath(
                    opj(self.ds.path, hit['parentds'])) if 'parentds' in hit else None,
                type=hit.get('type', None))
                for hit in hits]
            for res in query_aggregated_metadata(
                    # type is taken from hit record
                    reporton=None,
                    ds=self.ds,
                    aps=hits,
                    # never recursive, we have direct hits already
                    recursive=False):
                res.update(
                    refds=self.ds.path,
                    action='search',
                    status='ok',
                    logger=lgr,
                )
                yield res
                nhits += 1

            if max_nresults and nhits == max_nresults:
                lgr.info(
                    "Reached the limit of {}, there could be more which "
                    "were not reported.".format(topstr)
                )
Exemple #5
0
def _describe_datalad():
    from datalad.version import __version__, __full_version__

    return {
        'version': assure_unicode(__version__),
        'full_version': assure_unicode(__full_version__),
    }
Exemple #6
0
 def custom_result_renderer(res, **kwargs):  # pragma: no cover
     if not (res['status'] == 'ok' \
             and res['action'] in ('status', 'diff') \
             and res.get('state', None) != 'clean'):
         # logging reported already
         return
     from datalad.ui import ui
     # when to render relative paths:
     #  1) if a dataset arg was given
     #  2) if CWD is the refds
     refds = res.get('refds', None)
     refds = refds if kwargs.get('dataset', None) is not None \
         or refds == os.getcwd() else None
     # Note: We have to force unicode for res['path'] because
     # interface.utils encodes it on py2 before passing it to
     # custom_result_renderer().
     path = assure_unicode(res['path']) if refds is None \
         else text_type(ut.Path(res['path']).relative_to(refds))
     type_ = res.get('type', res.get('type_src', ''))
     max_len = len('untracked')
     state = res.get('state', 'unknown')
     ui.message(u'{fill}{state}: {path}{type_}'.format(
         fill=' ' * max(0, max_len - len(state)),
         state=ac.color_word(
             state, STATE_COLOR_MAP.get(res.get('state', 'unknown'))),
         path=path,
         type_=' ({})'.format(
             ac.color_word(type_, ac.MAGENTA) if type_ else '')))
Exemple #7
0
def normalize_command(command):
    """Convert `command` to the string representation.
    """
    if isinstance(command, list):
        command = list(map(assure_unicode, command))
        if len(command) == 1 and command[0] != "--":
            # This is either a quoted compound shell command or a simple
            # one-item command. Pass it as is.
            #
            # FIXME: This covers the predominant command-line case, but, for
            # Python API callers, it means values like ["./script with spaces"]
            # requires additional string-like escaping, which is inconsistent
            # with the handling of multi-item lists (and subprocess's
            # handling). Once we have a way to detect "running from Python API"
            # (discussed in gh-2986), update this.
            command = command[0]
        else:
            if command and command[0] == "--":
                # Strip disambiguation marker. Note: "running from Python API"
                # FIXME from below applies to this too.
                command = command[1:]
            command = " ".join(shlex_quote(c) for c in command)
    else:
        command = assure_unicode(command)
    return command
Exemple #8
0
 def custom_result_renderer(res, **kwargs):  # pragma: no cover
     if not (res['status'] == 'ok' \
             and res['action'] in ('status', 'diff') \
             and res.get('state', None) != 'clean'):
         # logging reported already
         return
     from datalad.ui import ui
     # when to render relative paths:
     #  1) if a dataset arg was given
     #  2) if CWD is the refds
     refds = res.get('refds', None)
     refds = refds if kwargs.get('dataset', None) is not None \
         or refds == os.getcwd() else None
     # Note: We have to force unicode for res['path'] because
     # interface.utils encodes it on py2 before passing it to
     # custom_result_renderer().
     path = assure_unicode(res['path']) if refds is None \
         else text_type(ut.Path(res['path']).relative_to(refds))
     type_ = res.get('type', res.get('type_src', ''))
     max_len = len('untracked')
     state = res.get('state', 'unknown')
     ui.message(u'{fill}{state}: {path}{type_}'.format(
         fill=' ' * max(0, max_len - len(state)),
         state=ac.color_word(
             state,
             STATE_COLOR_MAP.get(res.get('state', 'unknown'))),
         path=path,
         type_=' ({})'.format(
             ac.color_word(type_, ac.MAGENTA) if type_ else '')))
Exemple #9
0
def normalize_command(command):
    """Convert `command` to the string representation.
    """
    if isinstance(command, list):
        command = list(map(assure_unicode, command))
        if len(command) == 1 and command[0] != "--":
            # This is either a quoted compound shell command or a simple
            # one-item command. Pass it as is.
            #
            # FIXME: This covers the predominant command-line case, but, for
            # Python API callers, it means values like ["./script with spaces"]
            # requires additional string-like escaping, which is inconsistent
            # with the handling of multi-item lists (and subprocess's
            # handling). Once we have a way to detect "running from Python API"
            # (discussed in gh-2986), update this.
            command = command[0]
        else:
            if command and command[0] == "--":
                # Strip disambiguation marker. Note: "running from Python API"
                # FIXME from below applies to this too.
                command = command[1:]
            command = join_cmdline(command)
    else:
        command = assure_unicode(command)
    return command
Exemple #10
0
def test_aggregate_with_missing_or_duplicate_id(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True, if_dirty='ignore')
    subds.repo.remove(opj('.datalad', 'config'))
    subds.save()
    assert_false(exists(opj(subds.path, '.datalad', 'config')))
    subsubds = subds.create('subsub', force=True, if_dirty='ignore')
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 meta data sets, one implicit, and one native per dataset
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # no only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(ds,
                        guess_type=False,
                        ignore_subdatasets=False,
                        ignore_cache=False)
    # and we know nothing subsub
    for name in ('grandchild_äöü東', ):
        assert_true(
            sum([s.get('name', '') == assure_unicode(name) for s in meta]))

    # but search should not fail
    with swallow_outputs():
        res1 = list(search_('.', regex=True, dataset=ds))
    assert res1

    # and let's see now if we wouldn't fail if dataset is duplicate if we
    # install the same dataset twice
    subds_clone = ds.install(source=subds.path, path="subds2")
    with swallow_outputs():
        res2 = list(search_('.', regex=True, dataset=ds))
Exemple #11
0
def test_aggregate_with_missing_or_duplicate_id(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subds.remove(opj('.datalad', 'config'), if_dirty='ignore')
    assert_false(exists(opj(subds.path, '.datalad', 'config')))
    subsubds = subds.create('subsub', force=True)
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 meta data sets, one implicit, and one native per dataset
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # no only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    # and we know nothing subsub
    for name in ('grandchild_äöü東',):
        assert_true(sum([s.get('name', '') == assure_unicode(name) for s in meta]))

    # but search should not fail
    with swallow_outputs():
        res1 = list(search_('.', regex=True, dataset=ds))
    assert res1

    # and let's see now if we wouldn't fail if dataset is duplicate if we
    # install the same dataset twice
    subds_clone = ds.install(source=subds.path, path="subds2")
    with swallow_outputs():
        res2 = list(search_('.', regex=True, dataset=ds))
    def _get_dsmeta(self, bids):
        context = {}
        meta = {self._key2stdkey.get(k, k): v
                for k, v in bids.get_metadata(
                    opj(self.ds.path, self._dsdescr_fname)).items()}

        # TODO maybe normalize labels of standard licenses to definition URIs
        # perform mapping

        README_fname = opj(self.ds.path, 'README')
        if not meta.get('description') and exists(README_fname):
            # BIDS uses README to provide description, so if was not
            # explicitly provided to possibly override longer README, let's just
            # load README
            with open(README_fname, 'rb') as f:
                desc = assure_unicode(f.read())
            meta['description'] = desc.strip()

        # special case
        # Could be None which we can't strip so or ''
        bids_version = (meta.get('BIDSVersion', '') or '').strip()
        bids_defurl = 'http://bids.neuroimaging.io'
        if bids_version:
            bids_defurl += '/bids_spec{}.pdf'.format(bids_version)
        meta['conformsto'] = bids_defurl
        context['bids'] = {
            # not really a working URL, but BIDS doesn't provide term defs in
            # any accessible way
            '@id': '{}#'.format(bids_defurl),
            'description': 'ad-hoc vocabulary for the Brain Imaging Data Structure (BIDS) standard',
            'type': vocabulary_id,
        }
        context.update(vocabulary)
        meta['@context'] = context
        return meta
Exemple #13
0
 def __call__(path=None,
              fr='HEAD',
              to=None,
              dataset=None,
              annex=None,
              untracked='normal',
              recursive=False,
              recursion_limit=None):
     yield from diff_dataset(dataset=dataset,
                             fr=assure_unicode(fr),
                             to=assure_unicode(to),
                             constant_refs=False,
                             path=path,
                             annex=annex,
                             untracked=untracked,
                             recursive=recursive,
                             recursion_limit=recursion_limit)
Exemple #14
0
    def get_metadata(self, dataset, content):
        if not content:
            return {}, []
        context = {}
        contentmeta = []
        for f in self.paths:
            info = file_to_dict(opj(self.ds.path, f))
            if not info:
                # got nothing, likely nothing there
                # TODO check if this is an XMP sidecar file, parse that, and assign metadata
                # to the base file
                continue
            # update vocabulary
            vocab = {info[ns][0][0].split(':')[0]: {'@id': ns, 'type': vocabulary_id} for ns in info}
            # TODO this is dirty and assumed that XMP is internally consistent with the
            # definitions across all files -- which it likely isn't
            context.update(vocab)
            # now pull out actual metadata
            # cannot do simple dict comprehension, because we need to beautify things a little

            meta = {}
            for ns in info:
                for key, val, props in info[ns]:
                    if not val:
                        # skip everything empty
                        continue
                    if key.count('[') > 1:
                        # this is a nested array
                        # MIH: I do not think it is worth going here
                        continue
                    if props['VALUE_IS_ARRAY']:
                        # we'll catch the actuall array values later
                        continue
                    # normalize value
                    val = assure_unicode(val)
                    # non-breaking space
                    val = val.replace(u"\xa0", ' ')

                    field, idx, qual = xmp_field_re.match(key).groups()
                    normkey = u'{}{}'.format(field, qual)
                    if '/' in key:
                        normkey = u'{0}<{1}>'.format(*normkey.split('/'))
                    if idx:
                        # array
                        arr = meta.get(normkey, [])
                        arr.append(val)
                        meta[normkey] = arr
                    else:
                        meta[normkey] = val
            # compact
            meta = {k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in meta.items()}

            contentmeta.append((f, meta))

        return {
            '@context': context,
        }, \
            contentmeta
Exemple #15
0
 def __str__(self):
     from datalad.utils import assure_unicode
     to_str = "%s: " % self.__class__.__name__
     if self.cmd:
         to_str += "command '%s'" % (self.cmd, )
     if self.code:
         to_str += " failed with exitcode %d" % self.code
     to_str += "\n%s" % assure_unicode(self.msg)
     return to_str
Exemple #16
0
 def __str__(self):
     from datalad.utils import assure_unicode
     to_str = "%s: " % self.__class__.__name__
     if self.cmd:
         to_str += "command '%s'" % (self.cmd,)
     if self.code:
         to_str += " failed with exitcode %d" % self.code
     to_str += "\n%s" % assure_unicode(self.msg)
     return to_str
Exemple #17
0
def test__version__():
    # in released stage, version in the last CHANGELOG entry
    # should correspond to the one in datalad
    CHANGELOG_filename = op.join(
        op.dirname(__file__), op.pardir, op.pardir, 'CHANGELOG.md')
    if not op.exists(CHANGELOG_filename):
        raise SkipTest("no %s found" % CHANGELOG_filename)
    regex = re.compile(r'^## '
                       r'(?P<version>[0-9]+\.[0-9.abcrc~]+)\s+'
                       r'\((?P<date>.*)\)'
                       r'\s+--\s+'
                       r'(?P<codename>.+)'
                       )
    with open(CHANGELOG_filename, 'rb') as f:
        for line in f:
            line = line.rstrip()
            if not line.startswith(b'## '):
                # The first section header we hit, must be our changelog entry
                continue
            reg = regex.match(assure_unicode(line))
            if not reg:  # first one at that level is the one
                raise AssertionError(
                    "Following line must have matched our regex: %r" % line)
            regd = reg.groupdict()
            changelog_version = regd['version']
            lv_changelog_version = LooseVersion(changelog_version)
            # we might have a suffix - sanitize
            san__version__ = __version__.rstrip('.devdirty')
            lv__version__ = LooseVersion(san__version__)
            if '???' in regd['date'] and 'will be better than ever' in regd['codename']:
                # we only have our template
                # we can only assert that its version should be higher than
                # the one we have now
                assert_greater(lv_changelog_version, lv__version__)
            else:
                # should be a "release" record
                assert_not_in('???', regd['date'])
                assert_not_in('will be better than ever', regd['codename'])
                assert_equal(__hardcoded_version__, changelog_version)
                if __hardcoded_version__ != san__version__:
                    # It was not tagged yet and Changelog should have its
                    # template record for the next release
                    assert_greater(lv_changelog_version, lv__version__)
                    assert_in('.dev', san__version__)
                else:
                    # all is good, tagged etc
                    assert_equal(lv_changelog_version, lv__version__)
                    assert_equal(changelog_version, san__version__)
                    assert_equal(__hardcoded_version__, san__version__)
            return

    raise AssertionError(
        "No log line matching our regex found in %s" % CHANGELOG_filename
    )
Exemple #18
0
def _get_untracked_content(dspath, report_untracked, paths=None):
    cmd = [
        'git',
        '--work-tree=.',
        'status',
        '--porcelain',
        # file names NULL terminated
        '-z',
        # we never want to touch submodules, they cannot be untracked
        '--ignore-submodules=all',
        # fully untracked dirs as such, the rest as files
        '--untracked={}'.format(report_untracked)
    ]
    try:
        stdout, stderr = GitRunner(cwd=dspath).run(cmd,
                                                   log_stderr=True,
                                                   log_stdout=True,
                                                   log_online=False,
                                                   expect_stderr=False,
                                                   shell=False,
                                                   expect_fail=True)
    except CommandError as e:
        # TODO should we catch any and handle them in here?
        raise e

    if paths:
        paths = [r['path'] for r in paths]
        if len(paths) == 1 and paths[0] == dspath:
            # nothing to filter
            paths = None

    from datalad.utils import assure_unicode

    for line in stdout.split('\0'):
        if not line:
            continue
        line = assure_unicode(line)
        if not line.startswith('?? '):
            # nothing untracked, ignore, task of `diff`
            continue
        apath = opj(
            dspath,
            # strip state marker
            line[3:])
        norm_apath = normpath(apath)
        if paths and not any(norm_apath == p or path_startswith(apath, p)
                             for p in paths):
            # we got a whitelist for paths, don't report any other
            continue
        ap = dict(path=norm_apath,
                  parentds=dspath,
                  state='untracked',
                  type='directory' if isdir(apath) else 'file')
        yield ap
Exemple #19
0
def _get_untracked_content(dspath, report_untracked, paths=None):
    cmd = ['git', '--work-tree=.', 'status', '--porcelain',
           # file names NULL terminated
           '-z',
           # we never want to touch submodules, they cannot be untracked
           '--ignore-submodules=all',
           # fully untracked dirs as such, the rest as files
           '--untracked={}'.format(report_untracked)]
    try:
        stdout, stderr = GitRunner(cwd=dspath).run(
            cmd,
            log_stderr=True,
            log_stdout=True,
            log_online=False,
            expect_stderr=False,
            shell=False,
            expect_fail=True)
    except CommandError as e:
        # TODO should we catch any and handle them in here?
        raise e

    if paths:
        paths = [r['path'] for r in paths]
        if len(paths) == 1 and paths[0] == dspath:
            # nothing to filter
            paths = None

    from datalad.utils import assure_unicode

    for line in stdout.split('\0'):
        if not line:
            continue
        line = assure_unicode(line)
        if not line.startswith('?? '):
            # nothing untracked, ignore, task of `diff`
            continue
        apath = opj(
            dspath,
            # strip state marker
            line[3:])
        norm_apath = normpath(apath)
        if paths and not any(norm_apath == p or path_startswith(apath, p)
                             for p in paths):
            # we got a whitelist for paths, don't report any other
            continue
        ap = dict(
            path=norm_apath,
            parentds=dspath,
            state='untracked',
            type='directory' if isdir(apath) else 'file')
        yield ap
Exemple #20
0
    def show_keys(self, mode=None):
        maxl = 100  # maximal line length for unique values in mode=short
        # use a dict already, later we need to map to a definition
        # meanwhile map to the values

        keys = self._get_keys(mode)

        for k in sorted(keys):
            if mode == 'name':
                print(k)
                continue

            # do a bit more
            stat = keys[k]
            uvals = stat.uvals
            if mode == 'short':
                # show only up to X uvals
                if len(stat.uvals) > 10:
                    uvals = {v for i, v in enumerate(uvals) if i < 10}
            # all unicode still scares yoh -- he will just use repr
            # def conv(s):
            #     try:
            #         return '{}'.format(s)
            #     except UnicodeEncodeError:
            #         return assure_unicode(s).encode('utf-8')
            stat.uvals_str = assure_unicode(
                "{} unique values: {}".format(
                    len(stat.uvals), ', '.join(map(repr, uvals))))
            if mode == 'short':
                if len(stat.uvals) > 10:
                    stat.uvals_str += ', ...'
                if len(stat.uvals_str) > maxl:
                    stat.uvals_str = stat.uvals_str[:maxl-4] + ' ....'
            elif mode == 'full':
                pass
            else:
                raise ValueError(
                    "Unknown value for stats. Know full and short")

            print(
                u'{k}\n in  {stat.ndatasets} datasets\n has {stat.uvals_str}'.format(
                k=k, stat=stat
            ))
        # After #2156 datasets may not necessarily carry all
        # keys in the "unique" summary
        lgr.warning('In this search mode, the reported list of metadata keys may be incomplete')
Exemple #21
0
def test_get_open_files(p):
    pobj = Path(p)
    skip_if_no_module('psutil')
    eq_(get_open_files(p), {})
    f1 = pobj / '1'
    subd = pobj / 'd'
    with f1.open() as f:
        # since lsof does not care about PWD env var etc, paths
        # will not contain symlinks, we better realpath them
        # all before comparison
        eq_(get_open_files(p, log_open=40)[str(f1.resolve())].pid, os.getpid())

    assert not get_open_files(str(subd))

    if on_windows:
        # the remainder of the test assume a certain performance.
        # however, on windows get_open_files() can be very slow
        # (e.g. the first invocation in this test (above) can easily
        # take 30-50s). It is not worth slowing the tests to
        # accomodate this issue, given we have tested proper functioning
        # in principle already above).
        return

    # if we start a process within that directory, should get informed
    from subprocess import Popen, PIPE
    from time import time
    t0 = time()
    proc = Popen([
        sys.executable, '-c',
        r'import sys; sys.stdout.write("OK\n"); sys.stdout.flush();'
        r'import time; time.sleep(10)'
    ],
                 stdout=PIPE,
                 cwd=str(subd))
    # Assure that it started and we read the OK
    eq_(assure_unicode(proc.stdout.readline().strip()), u"OK")
    assert time(
    ) - t0 < 5  # that we were not stuck waiting for process to finish
    eq_(get_open_files(p)[str(subd.resolve())].pid, proc.pid)
    eq_(get_open_files(subd)[str(subd.resolve())].pid, proc.pid)
    proc.terminate()
    assert_equal(get_open_files(str(subd)), {})
Exemple #22
0
def test_inputs_quotes_needed(path):
    ds = Dataset(path).create(force=True)
    ds.add(".")
    cmd = "import sys; open(sys.argv[-1], 'w').write('!'.join(sys.argv[1:]))"
    # The string form of a command works fine when the inputs/outputs have
    # spaces ...
    cmd_str = "{} -c \"{}\" {{inputs}} {{outputs[0]}}".format(
        sys.executable, cmd)
    ds.run(cmd_str, inputs=["*.t*"], outputs=["out0"], expand="inputs")
    expected = u"!".join(
        list(sorted([OBSCURE_FILENAME + u".t", "bar.txt", "foo blah.txt"])) +
        ["out0"])
    with open(op.join(path, "out0")) as ifh:
        eq_(assure_unicode(ifh.read()), expected)
    # ... but the list form of a command does not. (Don't test this failure
    # with the obscure file name because we'd need to know its composition to
    # predict the failure.)
    cmd_list = [sys.executable, "-c", cmd, "{inputs}", "{outputs[0]}"]
    ds.run(cmd_list, inputs=["*.txt"], outputs=["out0"])
    ok_file_has_content(opj(path, "out0"), "bar.txt foo!blah.txt!out0")
Exemple #23
0
def test_inputs_quotes_needed(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    cmd = "import sys; open(sys.argv[-1], 'w').write('!'.join(sys.argv[1:]))"
    # The string form of a command works fine when the inputs/outputs have
    # spaces ...
    cmd_str = "{} -c \"{}\" {{inputs}} {{outputs[0]}}".format(
        sys.executable, cmd)
    ds.run(cmd_str, inputs=["*.t*"], outputs=["out0"], expand="inputs")
    expected = u"!".join(
        list(sorted([OBSCURE_FILENAME + u".t", "bar.txt", "foo blah.txt"])) +
        ["out0"])
    with open(op.join(path, "out0")) as ifh:
        eq_(assure_unicode(ifh.read()), expected)
    # ... but the list form of a command does not. (Don't test this failure
    # with the obscure file name because we'd need to know its composition to
    # predict the failure.)
    cmd_list = [sys.executable, "-c", cmd, "{inputs}", "{outputs[0]}"]
    ds.run(cmd_list, inputs=["*.txt"], outputs=["out0"])
    ok_file_has_content(op.join(path, "out0"), "bar.txt foo!blah.txt!out0")
Exemple #24
0
def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace,
                                      spec, includeds=None):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be in item with its path under the key (path) of the respective
      superdataset.
    includeds : sequence, optional
      Any paths given are treated as existing subdatasets, regardless of
      whether they can be found in the filesystem. Such subdatasets will appear
      under the key of the closest existing dataset in the `spec`.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` dict in-place.
      Keys are dataset paths, values are sets of subdataset paths
    """
    # convert to set for faster lookup
    includeds = includeds if isinstance(includeds, set) else \
        set() if includeds is None else set(includeds)
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accummulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    # this edge is not done, we need to try to reach any downstream
    # dataset
    undiscovered_ds = set(t for t in targetpaths) # if t != basepath)
    # whether anything in this directory matched a targetpath
    filematch = False
    if isdir(basepath):
        for p in listdir(basepath):
            p = assure_unicode(opj(basepath, p))
            if not isdir(p):
                if p in targetpaths:
                    filematch = True
                # we cannot have anything below this one
                continue
            # OPT listdir might be large and we could have only few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            # filter targets matching this downward path
            downward_targets = set(
                t for t in targetpaths if path_startswith(t, p))
            if not downward_targets:
                continue
            # remove the matching ones from the "todo" list
            undiscovered_ds.difference_update(downward_targets)
            # go one deeper
            discover_dataset_trace_to_targets(
                p, downward_targets, current_trace, spec,
                includeds=includeds if not includeds else includeds.intersection(
                    downward_targets))
    undiscovered_ds = [t for t in undiscovered_ds
                       if includeds and
                          path_is_subpath(t, current_trace[-1]) and
                          t in includeds]
    if filematch or basepath in targetpaths or undiscovered_ds:
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            subds = spec.get(p, set())
            subds.add(current_trace[i + 1])
            spec[p] = subds
        if undiscovered_ds:
            spec[current_trace[-1]] = spec.get(current_trace[-1], set()).union(
                undiscovered_ds)
Exemple #25
0
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything aggregated we would get nothing and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
    assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype',
                  'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype',
                     'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype',
                        'frictionless_datapackage',
                        where='dataset')
    ds.save(recursive=True)
    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also report layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # Now that we have annex.key
    # three different IDs
    assert_equal(
        3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] \
                    == assure_unicode(name) for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(opj(path, 'clone'),
                    source=ds.path,
                    result_xfm='datasets',
                    return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the extact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(r['query_matched']['frictionless_datapackage.name'],
                      r['metadata']['frictionless_datapackage']['name'])
Exemple #26
0
    def __call__(message=None, path=None, dataset=None,
                 all_updated=True, version_tag=None,
                 recursive=False, recursion_limit=None, super_datasets=False,
                 message_file=None
                 ):
        if not dataset and not path:
            # we got nothing at all -> save what is staged in the repo in "this" directory?
            # make sure we don't treat this as a user-provided '.' argument
            path = [{'path': abspath(curdir), 'raw_input': False}]

        refds_path = Interface.get_refds_path(dataset)

        if message and message_file:
            raise ValueError("Both a message and message file were specified")

        if message_file:
            with open(message_file, "rb") as mfh:
                message = assure_unicode(mfh.read())

        to_process = []
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='save',
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                modified='HEAD' if not path and recursive else None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('state', None) == 'untracked' and not ap.get('raw_input', False):
                # this path was found untracked, but not explicitly given to save
                # we will silently ignore this
                continue
            got_nothing = False
            # next check should not be done during annotation, as it is possibly expensive
            # and not generally useful
            if ap.get('status', None) == 'impossible' and \
                    ap.get('state', None) == 'absent' and \
                    ap.get('parentds', None):
                # this is not here anymore, but it might actually have been a deleted
                # component
                if relpath(ap['path'], start=ap['parentds']) \
                        in Dataset(ap['parentds']).repo.get_deleted_files():
                    # ok, this is a staged deletion that we want to save
                    ap['status'] = ''
                    del ap['message']
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            # for things like: `ds.save()`
            # or recursively discovered datasets
            if ap['path'] == refds_path or \
                    (ap.get('type', None) == 'dataset' and
                     not ap.get('raw_input', False) and
                     not ap.get('state', None) == 'absent'):
                ap['process_content'] = True
                ap['process_updated_only'] = all_updated
            to_process.append(ap)
        lgr.log(2, "save, to_process=%r", to_process)
        if got_nothing and recursive and refds_path:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict(
                'save',
                status='notneeded',
                path=refds_path,
                type='dataset',
                logger=lgr)
            return

        if not to_process:
            # nothing left to do, potentially all errored before
            return

        if super_datasets:
            # search for the topmost superdatasets of any path
            dss = [Dataset(ap.get('parentds', ap['path'])) for ap in to_process]
            superdss = [ds.get_superdataset(topmost=True)
                        for ds in dss]
            superdss = get_tree_roots(
                unique(ds.path for ds in dss + superdss if ds))
            if dataset:
                # need to adjust the reference to the new superds
                # if we had one ref before, we should still have exactly one
                assert len(superdss) <= 1
                dataset = list(superdss.keys())[0]
                refds_path = dataset
        elif refds_path:
            # there is a single superdataset
            superdss = {
                refds_path: unique([ap['parentds']
                                    for ap in to_process if 'parentds' in ap])}
        else:
            # sort all datasets under their potential superdatasets
            # start from the top to get all subdatasets down the line
            # and collate them into as few superdatasets as possible
            # this is quick, just string operations
            superdss = get_tree_roots(
                unique([ap['parentds'] for ap in to_process if 'parentds' in ap]))
        # for each "superdataset" check the tree of subdatasets and make sure
        # we gather all datasets between the super and any subdataset
        # so we can save them all bottom-up in order to be able to properly
        # save the superdataset
        # if this is called from e.g. `add` this is actually not necessary,
        # but in the general case we cannot avoid it
        # TODO maybe introduce a switch?
        discovered = {}
        for superds_path in superdss:
            target_subs = superdss[superds_path]
            discover_dataset_trace_to_targets(
                # from here
                superds_path,
                # to all
                target_subs,
                [],
                discovered)
        # create a new minimally annotated path for each discovered dataset
        discovered_added = set()
        for parentds in discovered:
            for subds in discovered[parentds]:
                to_process.append(dict(
                    path=subds,
                    parentds=parentds,
                    type='dataset'))
                discovered_added.add(subds)
        # make sure we have an entry for each dataset, including those
        # tha are just parents
        for parentds in discovered:
            if parentds not in discovered_added:
                to_process.append(dict(
                    path=parentds,
                    type='dataset',
                    # make sure we save content of superds later on
                    process_content=True,
                    # but not do nasty things, like adding untracked content
                    # just because we discovered this dataset
                    process_updated_only=True))

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, deduplication happens here too
        annotated_paths = AnnotatePaths.__call__(
            path=to_process,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='save',
            unavailable_path_status='',
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        # now sort into datasets so we can process them one by one
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert(not completed)

        # iterate over all datasets, starting at the bottom
        for dspath in sorted(content_by_ds.keys(), reverse=True):
            ds = Dataset(dspath)
            res = get_status_dict('save', ds=ds, logger=lgr)
            if not ds.is_installed():
                # TODO This is likely impossible now
                res['status'] = 'impossible'
                res['message'] = ('dataset %s is not installed', ds)
                yield res
                continue
            saved_state = save_dataset(
                ds,
                content_by_ds[dspath],
                message=message)
            res['status'] = 'ok' if saved_state else 'notneeded'
            # MIH: let's tag even if there was nothing commit. I'd forget this
            # option too often...
            if version_tag:
                try:
                    # TODO: check whether comment below is still true after
                    # removing the log swallowing:
                    # again cannot help but force-silence low-level code, because
                    # it screams like a made man instead of allowing top-level
                    # code an orderly error report
                    ds.repo.tag(version_tag)
                    # even if we haven't saved anything
                    res['status'] = 'ok'
                    yield res
                except CommandError as e:
                    if saved_state:
                        # first we yield the result for the actual save
                        yield res
                    # and now complain that tagging didn't work
                    yield get_status_dict(
                        'save',
                        ds=ds,
                        logger=lgr,
                        status='error',
                        message=(
                            'cannot tag this version: %s',
                            e.stderr.strip()))
            else:
                yield res
Exemple #27
0
def test_aggregation(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(op.join(path, 'origin')).create(force=True)
    ds.config.add('datalad.metadata.nativetype',
                  'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype',
                     'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype',
                        'frictionless_datapackage',
                        where='dataset')
    assert_status('ok', ds.save(recursive=True))
    # while we are at it: dot it again, nothing should happen
    assert_status('notneeded', ds.save(recursive=True))

    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.meta_aggregate(recursive=True, into='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 3, status='ok', action='meta_aggregate')
    # the respective super datasets see two saves, one to record the change
    # in the subdataset after its own aggregation, and one after the super
    # updated with aggregated metadata
    assert_result_count(res, 5, status='ok', action='save', type='dataset')
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of aggregate report
    aggs = ds.meta_dump(reporton='aggregates', recursive=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also report layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.meta_dump(recursive=True)
    # basic sanity check
    assert_result_count(origres, 3, type='dataset')
    assert_result_count([r for r in origres if r['path'].endswith('.json')],
                        3,
                        type='file')  # Now that we have annex.key
    # three different IDs
    eq_(
        3,
        len(
            set([
                _get_dsid_from_core_metadata(s['metadata']['metalad_core'])
                for s in origres if s['type'] == 'dataset'
            ])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] \
                    == assure_unicode(name) for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(op.join(path, 'clone'),
                    source=ds.path,
                    result_xfm='datasets',
                    return_type='item-or-list')
    # ID mechanism works
    eq_(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.meta_dump()
    # basic sanity check
    assert_result_count(cloneres, 1, type='dataset')
    # payload file
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the extact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(r['query_matched']['frictionless_datapackage.name'],
                      r['metadata']['frictionless_datapackage']['name'])
Exemple #28
0
def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace,
                                      spec, includeds=None):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of path that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be in item with its path under the key (path) of the respective
      superdataset.
    includeds : sequence, optional
      Any paths given are treated as existing subdatasets, regardless of
      whether they can be found in the filesystem. Such subdatasets will appear
      under the key of the closest existing dataset in the `spec`.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` dict in-place.
      Keys are dataset paths, values are sets of subdataset paths
    """
    # convert to set for faster lookup
    includeds = includeds if isinstance(includeds, set) else \
        set() if includeds is None else set(includeds)
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accummulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    # this edge is not done, we need to try to reach any downstream
    # dataset
    undiscovered_ds = set(t for t in targetpaths) # if t != basepath)
    # whether anything in this directory matched a targetpath
    filematch = False
    if isdir(basepath):
        for p in listdir(basepath):
            p = assure_unicode(opj(basepath, p))
            if not isdir(p):
                if p in targetpaths:
                    filematch = True
                # we cannot have anything below this one
                continue
            # OPT listdir might be large and we could have only few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            # filter targets matching this downward path
            downward_targets = set(
                t for t in targetpaths if path_startswith(t, p))
            if not downward_targets:
                continue
            # remove the matching ones from the "todo" list
            undiscovered_ds.difference_update(downward_targets)
            # go one deeper
            discover_dataset_trace_to_targets(
                p, downward_targets, current_trace, spec,
                includeds=includeds if not includeds else includeds.intersection(
                    downward_targets))
    undiscovered_ds = [t for t in undiscovered_ds
                       if includeds and
                          path_is_subpath(t, current_trace[-1]) and
                          t in includeds]
    if filematch or basepath in targetpaths or undiscovered_ds:
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            subds = spec.get(p, set())
            subds.add(current_trace[i + 1])
            spec[p] = subds
        if undiscovered_ds:
            spec[current_trace[-1]] = spec.get(current_trace[-1], set()).union(
                undiscovered_ds)
Exemple #29
0
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subsubds = subds.create('subsub', force=True)
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 meta data sets, one implicit, and one native per dataset
    # and a second natiev set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # no only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    assert_equal(len(meta), 10)
    # same schema
    assert_equal(
        10,
        sum([s.get('@context', {'@vocab': None})['@vocab'] == 'http://schema.org/'
             for s in meta]))
    # three different IDs
    assert_equal(3, len(set([s.get('@id') for s in meta])))
    # and we know about all three datasets
    for name in ('mother_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(sum([s.get('name', None) == assure_unicode(name) for s in meta]))
    #print(meta)
    assert_equal(
        # first implicit, then two natives, then aggregate
        meta[3]['dcterms:hasPart']['@id'],
        subds.id)
    success = False
    for m in meta:
        p = m.get('dcterms:hasPart', {})
        if p.get('@id', None) == subsubds.id:
            assert_equal(opj('sub', 'subsub'), p.get('location', None))
            success = True
    assert_true(success)

    # save the toplevel dataset only (see below)
    ds.save('with aggregated meta data', all_changes=True)

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(opj(path, 'clone'), source=ds.path)
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh meta data, the implicit one for the top-most datasets should
    # differ, but the rest not
    clonemeta = get_metadata(
        clone, guess_type=False, ignore_subdatasets=False, ignore_cache=False)

    # make sure the implicit md for the topmost come first
    assert_equal(clonemeta[0]['@id'], clone.id)
    assert_equal(clonemeta[0]['@id'], ds.id)
    assert_equal(clone.repo.get_hexsha(), ds.repo.get_hexsha())
    assert_equal(clonemeta[0]['version'], ds.repo.get_hexsha())
    # all but the implicit is identical
    assert_equal(clonemeta[1:], meta[1:])
    # the implicit md of the clone should list a dataset ID for its subds,
    # although it has not been obtained!
    assert_equal(
        clonemeta[3]['dcterms:hasPart']['@id'],
        subds.id)

    # now obtain a subdataset in the clone and the IDs should be updated
    clone.install('sub')
    partial = get_metadata(clone, guess_type=False, ignore_cache=True)
    # ids don't change
    assert_equal(partial[0]['@id'], clonemeta[0]['@id'])
    # datasets are properly connected
    assert_equal(partial[1]['dcterms:hasPart']['@id'],
                 partial[2]['@id'])

    # query smoke test
    if os.environ.get('DATALAD_TESTS_NONETWORK'):
        raise SkipTest

    assert_equal(len(list(clone.search('mother'))), 1)
    assert_equal(len(list(clone.search('MoTHER'))), 1)  # case insensitive

    child_res = list(clone.search('child'))
    assert_equal(len(child_res), 2)

    # little helper to match names
    def assert_names(res, names, path=clone.path):
        assert_equal(list(map(itemgetter(0), res)),
                     [opj(path, n) for n in names])
    # should yield (location, report) tuples
    assert_names(child_res, ['sub', 'sub/subsub'])

    # result should be identical to invoking search from api
    # and search_ should spit out locations out
    with swallow_outputs() as cmo:
        res = list(search_('child', dataset=clone))
        assert_equal(res, child_res)
        assert_in(res[0][0], cmo.out)
    # and overarching search_ just for smoke testing of processing outputs
    # and not puking (e.g. under PY3)
    with swallow_outputs() as cmo:
        assert list(search_('.', regex=True, dataset=clone))
        assert cmo.out

    # test searching among specified properties only
    assert_names(clone.search('i', search='name'), ['sub', 'sub/subsub'])
    assert_names(clone.search('i', search='keywords'), ['.'])
    # case shouldn't matter
    assert_names(clone.search('i', search='Keywords'), ['.'])
    assert_names(clone.search('i', search=['name', 'keywords']),
                 ['.', 'sub', 'sub/subsub'])

    # without report_matched, we are getting none of the fields
    assert(all([not x for x in map(itemgetter(1), child_res)]))
    # but we would get all if asking for '*'
    assert(all([len(x) >= 9
                for x in map(itemgetter(1),
                             list(clone.search('child', report='*')))]))
    # but we would get only the matching name if we ask for report_matched
    assert_equal(
        set(map(lambda x: tuple(x[1].keys()),
                clone.search('child', report_matched=True))),
        set([('name',)])
    )
    # and the additional field we might have asked with report
    assert_equal(
        set(map(lambda x: tuple(sorted(x[1].keys())),
                clone.search('child', report_matched=True,
                             report=['schema:type']))),
        set([('name', 'schema:type')])
    )
    # and if we ask report to be 'empty', we should get no fields
    child_res_empty = list(clone.search('child', report=''))
    assert_equal(len(child_res_empty), 2)
    assert_equal(
        set(map(lambda x: tuple(x[1].keys()), child_res_empty)),
        set([tuple()])
    )

    # more tests on returned paths:
    assert_names(clone.search('datalad'), ['.', 'sub', 'sub/subsub'])
    # if we clone subdataset and query for value present in it and its kid
    clone_sub = clone.install('sub')
    assert_names(clone_sub.search('datalad'), ['.', 'subsub'], clone_sub.path)

    # Test 'and' for multiple search entries
    assert_equal(len(list(clone.search(['child', 'bids']))), 2)
    assert_equal(len(list(clone.search(['child', 'subsub']))), 1)
    assert_equal(len(list(clone.search(['bids', 'sub']))), 2)

    res = list(clone.search('.*', regex=True))  # with regex
    assert_equal(len(res), 3)  # one per dataset

    # we do search, not match
    assert_equal(len(list(clone.search('randchild', regex=True))), 1)
    assert_equal(len(list(clone.search(['gr.nd', 'ch.ld'], regex=True))), 1)
    assert_equal(len(list(clone.search('randchil.', regex=True))), 1)
    assert_equal(len(list(clone.search('^randchild.*', regex=True))), 0)
    assert_equal(len(list(clone.search('^grandchild.*', regex=True))), 1)
    assert_equal(len(list(clone.search('grandchild'))), 1)
Exemple #30
0
    def __call__(
            path=None,
            dataset=None,
            # support passing this through in a path by path basis
            to_git=None,
            save=True,
            message=None,
            message_file=None,
            recursive=False,
            recursion_limit=None,
            ds2super=False,
            git_opts=None,
            annex_opts=None,
            annex_add_opts=None,
            jobs=None):
        # parameter constraints:
        if not path:
            raise InsufficientArgumentsError(
                "insufficient information for adding: requires at least a path"
            )
        refds_path = Interface.get_refds_path(dataset)
        common_report = dict(action='add', logger=lgr, refds=refds_path)

        if message and message_file:
            raise ValueError("Both a message and message file were specified")

        if message_file:
            with open(message_file, "rb") as mfh:
                message = assure_unicode(mfh.read())

        to_add = []
        subds_to_add = {}
        ds_to_annotate_from_recursion = {}
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=dataset,
                # never recursion, need to handle manually below to be able to
                # discover untracked content
                recursive=False,
                action='add',
                # speed things up by using Git's modification detection, if there
                # is a repo with at least one commit
                modified='HEAD' \
                if dataset and \
                GitRepo.is_valid_repo(refds_path) and \
                GitRepo(refds_path).get_hexsha() \
                else None,
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            got_nothing = False
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('parentds', None) is None and ap.get('type',
                                                           None) != 'dataset':
                yield get_status_dict(
                    status='impossible',
                    message='"there is no dataset to add this path to',
                    **dict(common_report, **ap))
                continue
            if ap.get('type', None) == 'directory' and \
                    ap.get('state', None) == 'untracked' and \
                    GitRepo.is_valid_repo(ap['path']):
                # this is an untracked wannabe subdataset in disguise
                ap['type'] = 'dataset'
            if recursive and \
                    (ap.get('raw_input', False) or
                     ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                    (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
                # this was an actually requested input path, or a path that was found
                # modified by path annotation, based on an input argument
                # we need to recurse into all subdirs to find potentially
                # unregistered subdatasets
                # but only if this path has a parent, or is itself a dataset
                # otherwise there is nothing to add to
                _discover_subdatasets_recursively(
                    ds_to_annotate_from_recursion, ap['path'],
                    [ap['parentds'] if 'parentds' in ap else ap['path']],
                    recursion_limit)
                # get the file content of the root dataset of this search added too
                # but be careful with extreme recursion_limit settings
                if recursion_limit is None or recursion_limit > 0:
                    ap['process_content'] = True
            # record for further processing
            if not ap['path'] in ds_to_annotate_from_recursion:
                # if it was somehow already discovered
                to_add.append(ap)
        if got_nothing:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict('add',
                                  status='notneeded',
                                  path=refds_path,
                                  type='dataset',
                                  logger=lgr)
            return

        for subds in ds_to_annotate_from_recursion:
            if subds not in subds_to_add:
                # always prefer the already annotated path
                subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

        if dataset:
            # we have a base dataset, discover any intermediate datasets between
            # the base and any already discovered dataset
            discovered = {}
            discover_dataset_trace_to_targets(
                # from here
                dataset.path,
                # to any dataset we are aware of
                subds_to_add.keys(),
                [],
                discovered)
            for parentds in discovered:
                for subds in discovered[parentds]:
                    subds_to_add[subds] = subds_to_add.get(
                        subds,
                        dict(path=subds, parentds=parentds, type='dataset'))

        # merge custom paths and discovered dataset records, paths needs to go first,
        # because we know most about then, and subsequent annotation call we skip the
        # later duplicate ones
        to_add.extend(subds_to_add.values())
        # and compact, this should be OK as all the info is in each ap dict
        to_add = unique(to_add, lambda x: x['path'])

        if not to_add:
            # nothing left to do, potentially all errored before
            return

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, it will also deduplicate
        annotated_paths = AnnotatePaths.__call__(
            path=to_add,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='add',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert (not completed)

        if not content_by_ds:
            # we should have complained about any inappropriate path argument
            # above, so if nothing is left, we can simply exit
            return

        # simple loop over datasets -- save happens later
        # start deep down
        to_save = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            torepoadd = {}
            respath_by_status = {}
            for ap in content_by_ds[ds_path]:
                # we have a new story
                ap.pop('status', None)
                torepoadd[ap['path']] = ap

                # skip anything that doesn't look like a wannabe subdataset
                if not ap.get('type', None) == 'dataset' or \
                        ap['path'] == ds_path:
                    continue

                if ap.get('registered_subds', False):
                    # subdataset that might be in this list because of the
                    # need to save all the way up to a super dataset
                    respath_by_status['success'] = \
                        respath_by_status.get('success', []) + [ap['path']]
                    yield get_status_dict(status='notneeded',
                                          message="already known subdataset",
                                          **dict(common_report, **ap))
                    continue
                subds = Dataset(ap['path'])
                subds_relpath = relpath(ap['path'], ds_path)
                # Register the repository in the repo tree as a submodule
                try:
                    ds.repo.add_submodule(subds_relpath, url=None, name=None)
                except (CommandError, InvalidGitRepositoryError) as e:
                    yield get_status_dict(ds=subds,
                                          status='error',
                                          message=getattr(e, 'stderr', None)
                                          or text_type(e),
                                          **dict(common_report, **ap))
                    continue
                # queue for saving using the updated annotated path
                ap['registered_subds'] = True
                # I hope this is true in direct mode too
                # TODO this is disabled, because in some circumstances
                # staging just doesn't happen, and it is unclear when
                # exactly -- the case that prompted disabling was a submodule
                # that had no content except for other submodules was not staged,
                # whereas another submodule on the same level in the same
                # superdataset which also has one file in it was staged
                # disable to work correctly, while paying a little bit of
                # slow down
                #ap['staged'] = True
                to_save.append(ap)
                # report added subdatasets -- `annex add` below won't do it
                yield get_status_dict(ds=subds,
                                      status='ok',
                                      message='added new subdataset',
                                      **dict(common_report, **ap))
                # make sure that .gitmodules is added to the list of files
                gitmodules_path = opj(ds.path, '.gitmodules')
                # for git
                torepoadd[gitmodules_path] = dict(path=gitmodules_path)
                # and for save
                to_save.append(
                    dict(path=gitmodules_path, parentds=ds_path, type='file'))
            # make sure any last minute additions make it to the saving stage
            # XXX? should content_by_ds become OrderedDict so that possible
            # super here gets processed last?
            lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
            is_annex = isinstance(ds.repo, AnnexRepo)
            add_kw = {'jobs': jobs} if is_annex and jobs else {}
            added = ds.repo.add_(list(torepoadd.keys()),
                                 git=to_git if is_annex else True,
                                 **add_kw)
            for a in added:
                res = annexjson2result(a, ds, type='file', **common_report)
                success = success_status_map[res['status']]
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
                # produce best possible path/result annotation
                if res['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    res = dict(torepoadd[res['path']], **res)
                # override this in all cases to be safe
                res['parentds'] = ds.path
                if success:
                    # this was successfully added, queue for saving this very path
                    # in the dataset
                    ap = {k: v for k, v in res.items() if k != 'status'}
                    ap['staged'] = True
                    # strip any status and state info (e.g. save will refuse to save
                    # stuff that is marked state='untracked'
                    to_save.append({
                        k: v
                        for k, v in res.items() if k not in ('status', 'state')
                    })
                if a['file'] == '.gitmodules':
                    # filter out .gitmodules, because this is only included for
                    # technical reasons and has nothing to do with the actual content
                    continue
                if GitRepo.is_valid_repo(res['path']):
                    # more accurate report in case of an added submodule
                    # mountpoint.
                    # XXX Actually not sure if this can really happen
                    # (depends on what our low-level code would do)
                    # but worst case is that we loose a little bit of
                    # coverage...
                    res['type'] = 'dataset'
                    res['message'] = 'added new state as submodule'
                yield res

            for r in results_from_annex_noinfo(
                    ds,
                    torepoadd,
                    respath_by_status,
                    dir_fail_msg='could not add some content in %s %s',
                    noinfo_dir_msg='nothing to add from %s',
                    noinfo_file_msg='already included in the dataset',
                    action='add',
                    logger=lgr,
                    refds=refds_path):
                if r['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    r = dict(r, **torepoadd[r['path']])

                if r['status'] == 'notneeded':
                    # this could be a file that was staged already, it doesn't need
                    # to be added, but it should be saved/commited if so desired
                    to_save.append({
                        k: v
                        for k, v in r.items() if k not in ('status', 'state')
                    })

                # XXX something is fishy with the next one, rethink when sober....
                if r['path'] == ds_path and r['status'] == 'ok':
                    # this is for the entire dataset itself which was explicitly requested
                    # make sure to save all
                    r['type'] = 'dataset'
                    r['process_content'] = True
                    to_save.append(
                        {k: v
                         for k, v in r.items() if k != 'status'})
                yield r
            if refds_path and ds_path != refds_path and len(
                    respath_by_status.get('success', [])):
                # TODO XXX we have an issue here when with `add('.')` and annex ignores any
                # dotfiles. In this case we end up not saving a dataset completely, because
                # we rely on accurate reporting. there is an issue about this already
                # TODO look up the issue ID
                # if there is a base dataset, but we are below it, and we have anything done to this
                # dataset -> queue dataset itself for saving its state in the parent
                ds_ap = dict(
                    path=ds.path,
                    # we have to look for the parent here, as we must save the
                    # subdataset in the parent and not the whole subdataset itself
                    type='dataset')
                parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
                if parentds:
                    ds_ap['parentds'] = parentds
                if dataset:
                    ds_ap['refds'] = refds_path
                to_save.append(ds_ap)

        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        # TODO tell save what was staged already! Set 'staged=True' for
        # respective annotated paths that are fed into `save`

        # do not reuse any of the sorting done in here for saving, but instead
        # pass on all the annotated paths to have `save` figure out what to do with
        # them -- this is costs something, but should be safer, and frankly is
        # more comprehensible
        for res in Save.__call__(
                # hand-selected annotated paths
                path=to_save,
                dataset=refds_path,
                message=message if message else '[DATALAD] added content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
Exemple #31
0
    def get_metadata(self, dataset, content):
        if not content:
            return {}, []
        context = {}
        contentmeta = []
        log_progress(
            lgr.info,
            'extractorxmp',
            'Start XMP metadata extraction from %s', self.ds,
            total=len(self.paths),
            label='XMP metadata extraction',
            unit=' Files',
        )
        for f in self.paths:
            absfp = opj(self.ds.path, f)
            log_progress(
                lgr.info,
                'extractorxmp',
                'Extract XMP metadata from %s', absfp,
                update=1,
                increment=True)
            info = file_to_dict(absfp)
            if not info:
                # got nothing, likely nothing there
                # TODO check if this is an XMP sidecar file, parse that, and assign metadata
                # to the base file
                continue
            # update vocabulary
            vocab = {info[ns][0][0].split(':')[0]: {'@id': ns, 'type': vocabulary_id} for ns in info}
            # TODO this is dirty and assumed that XMP is internally consistent with the
            # definitions across all files -- which it likely isn't
            context.update(vocab)
            # now pull out actual metadata
            # cannot do simple dict comprehension, because we need to beautify things a little

            meta = {}
            for ns in info:
                for key, val, props in info[ns]:
                    if not val:
                        # skip everything empty
                        continue
                    if key.count('[') > 1:
                        # this is a nested array
                        # MIH: I do not think it is worth going here
                        continue
                    if props['VALUE_IS_ARRAY']:
                        # we'll catch the actuall array values later
                        continue
                    # normalize value
                    val = assure_unicode(val)
                    # non-breaking space
                    val = val.replace(u"\xa0", ' ')

                    field, idx, qual = xmp_field_re.match(key).groups()
                    normkey = u'{}{}'.format(field, qual)
                    if '/' in key:
                        normkey = u'{0}<{1}>'.format(*normkey.split('/'))
                    if idx:
                        # array
                        arr = meta.get(normkey, [])
                        arr.append(val)
                        meta[normkey] = arr
                    else:
                        meta[normkey] = val
            # compact
            meta = {k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in meta.items()}

            contentmeta.append((f, meta))

        log_progress(
            lgr.info,
            'extractorxmp',
            'Finished XMP metadata extraction from %s', self.ds
        )
        return {
            '@context': context,
        }, \
            contentmeta
Exemple #32
0
def test_assure_unicode():
    ok_(isinstance(assure_unicode("m"), str))
    ok_(isinstance(assure_unicode('grandchild_äöü東'), str))
    ok_(isinstance(assure_unicode(u'grandchild_äöü東'), str))
    eq_(assure_unicode('grandchild_äöü東'), u'grandchild_äöü東')
    # now, non-utf8
    # Decoding could be deduced with high confidence when the string is
    # really encoded in that codepage
    mom_koi8r = u"мама".encode('koi8-r')
    eq_(assure_unicode(mom_koi8r), u"мама")
    eq_(assure_unicode(mom_koi8r, confidence=0.9), u"мама")
    mom_iso8859 = u'mamá'.encode('iso-8859-1')
    eq_(assure_unicode(mom_iso8859), u'mamá')
    eq_(assure_unicode(mom_iso8859, confidence=0.5), u'mamá')
    # but when we mix, it does still guess something allowing to decode:
    mixedin = mom_koi8r + u'東'.encode('iso2022_jp') + u'東'.encode('utf-8')
    ok_(isinstance(assure_unicode(mixedin), str))
    # but should fail if we request high confidence result:
    with assert_raises(ValueError):
        assure_unicode(mixedin, confidence=0.9)
    # For other, non string values, actually just returns original value
    # TODO: RF to actually "assure" or fail??  For now hardcoding that assumption
    assert assure_unicode(1) == 1
Exemple #33
0
def _any2unicode(val):
    return str_contructor(val) if isinstance(val,
                                             (int,
                                              float)) else assure_unicode(val)
Exemple #34
0
    def __call__(query=None,
                 dataset=None,
                 force_reindex=False,
                 max_nresults=20,
                 show_keys=False,
                 show_query=False):
        from whoosh import qparser as qparse

        try:
            ds = require_dataset(dataset,
                                 check_installed=True,
                                 purpose='dataset search')
            if ds.id is None:
                raise NoDatasetArgumentFound(
                    "This does not seem to be a dataset (no DataLad dataset ID "
                    "found). 'datalad create --force %s' can initialize "
                    "this repository as a DataLad dataset" % ds.path)
        except NoDatasetArgumentFound:
            for r in _search_from_virgin_install(dataset, query):
                yield r
            return

        # where does the bunny have the eggs?
        index_dir = opj(ds.path, get_git_dir(ds.path), 'datalad',
                        'search_index')

        idx_obj = _get_search_index(index_dir, ds, force_reindex)

        if show_keys:
            definitions_fname = opj(index_dir,
                                    'datalad_term_definitions.json.gz')
            try:
                defs = jsonload(gzopen(definitions_fname))
            except Exception as e:
                lgr.warning(
                    'No term definitions found alongside search index: %s',
                    exc_str(e))
                defs = {}

            for k in idx_obj.schema.names():
                print('{}{}'.format(
                    k,
                    ' {}'.format(defs[k] if isinstance(defs[k], dict) else
                                 '({})'.format(defs[k])) if k in defs else ''))
            return

        if not query:
            return

        with idx_obj.searcher() as searcher:
            # parse the query string, default whoosh parser ATM, could be
            # tailored with plugins
            parser = qparse.MultifieldParser(idx_obj.schema.names(),
                                             idx_obj.schema)
            # XXX: plugin is broken in Debian's whoosh 2.7.0-2, but already fixed
            # upstream
            parser.add_plugin(qparse.FuzzyTermPlugin())
            parser.add_plugin(qparse.GtLtPlugin())
            # replace field defintion to allow for colons to be part of a field's name:
            parser.replace_plugin(
                qparse.FieldsPlugin(expr=r"(?P<text>[()<>:\w]+|[*]):"))
            # for convenience we accept any number of args-words from the
            # shell and put them together to a single string here
            querystr = ' '.join(assure_list(query))
            # this gives a formal whoosh query
            wquery = parser.parse(querystr)

            if show_query:
                print(wquery)
                return
            # perform the actual search
            hits = searcher.search(
                wquery,
                terms=True,
                limit=max_nresults if max_nresults > 0 else None)
            # cheap way to get an approximate number of hits, without an expensive
            # scoring of all items
            # disabled: unreliable estimate, often confusing
            #nhits = hits.estimated_min_length()
            # report query stats
            topstr = '{} top {}'.format(
                max_nresults, single_or_plural('match', 'matches',
                                               max_nresults))
            lgr.info('Query completed in {} sec.{}'.format(
                hits.runtime, ' Reporting {}.'.format((
                    'up to ' + topstr) if max_nresults > 0 else 'all matches')
                if not hits.is_empty() else ' No matches.'))

            if not hits:
                return

            nhits = 0
            for hit in hits:
                res = dict(
                    action='search',
                    status='ok',
                    logger=lgr,
                    refds=ds.path,
                    # normpath to avoid trailing dot
                    path=normpath(opj(ds.path, hit['path'])),
                    query_matched={
                        assure_unicode(k): assure_unicode(v) if isinstance(
                            v, unicode_srctypes) else v
                        for k, v in hit.matched_terms()
                    },
                    metadata={
                        k: v
                        for k, v in hit.fields().items()
                        if k not in ('path', 'parentds')
                    })
                if 'parentds' in hit:
                    res['parentds'] = normpath(opj(ds.path, hit['parentds']))
                yield res
                nhits += 1

            if max_nresults and nhits == max_nresults:
                lgr.info("Reached the limit of {}, there could be more which "
                         "were not reported.".format(topstr))
Exemple #35
0
def test_wtf(topdir):
    path = opj(topdir, OBSCURE_FILENAME)
    # smoke test for now
    with swallow_outputs() as cmo:
        wtf(dataset=path, on_failure="ignore")
        assert_not_in('## dataset', cmo.out)
        assert_in('## configuration', cmo.out)
        # Those sections get sensored out by default now
        assert_not_in('user.name: ', cmo.out)
    with chpwd(path):
        with swallow_outputs() as cmo:
            wtf()
            assert_not_in('## dataset', cmo.out)
            assert_in('## configuration', cmo.out)
    # now with a dataset
    ds = create(path)
    with swallow_outputs() as cmo:
        wtf(dataset=ds.path)
        assert_in('## configuration', cmo.out)
        assert_in('## dataset', cmo.out)
        assert_in(u'path: {}'.format(ds.path), assure_unicode(cmo.out))

    # and if we run with all sensitive
    for sensitive in ('some', True):
        with swallow_outputs() as cmo:
            wtf(dataset=ds.path, sensitive=sensitive)
            # we fake those for tests anyways, but we do show cfg in this mode
            # and explicitly not showing them
            assert_in('user.name: %s' % _HIDDEN, cmo.out)

    with swallow_outputs() as cmo:
        wtf(dataset=ds.path, sensitive='all')
        assert_not_in(_HIDDEN, cmo.out)  # all is shown
        assert_in('user.name: ', cmo.out)

    # Sections selection
    #
    # If we ask for no sections and there is no dataset
    with chpwd(path):
        with swallow_outputs() as cmo:
            wtf(sections=[])
            assert_not_in('## dataset', cmo.out)
            for s in SECTION_CALLABLES:
                assert_not_in('## %s' % s.lower(), cmo.out.lower())

    # ask for a selected set
    secs = ['git-annex', 'configuration']
    with chpwd(path):
        with swallow_outputs() as cmo:
            wtf(sections=secs)
            for s in SECTION_CALLABLES:
                (assert_in if s in secs else assert_not_in)('## %s' %
                                                            s.lower(),
                                                            cmo.out.lower())
            # order should match our desired one, not alphabetical
            # but because of https://github.com/datalad/datalad/issues/3915
            # alphanum is now desired
            assert cmo.out.index('## git-annex') > cmo.out.index(
                '## configuration')

    # not achievable from cmdline is to pass an empty list of sections.
    with chpwd(path):
        with swallow_outputs() as cmo:
            wtf(sections=[])
            eq_(cmo.out.rstrip(), '# WTF')

    # and we could decorate it nicely for embedding e.g. into github issues
    with swallow_outputs() as cmo:
        wtf(sections=['dependencies'], decor='html_details')
        ok_startswith(cmo.out,
                      '<details><summary>DataLad %s WTF' % __version__)
        assert_in('## dependencies', cmo.out)

    # should result only in '# WTF'
    skip_if_no_module('pyperclip')

    # verify that it works correctly in the env/platform
    import pyperclip
    with swallow_outputs() as cmo:
        try:
            pyperclip.copy("xxx")
            pyperclip_works = pyperclip.paste().strip() == "xxx"
            wtf(dataset=ds.path, clipboard=True)
        except (AttributeError, pyperclip.PyperclipException) as exc:
            # AttributeError could come from pyperclip if no DISPLAY
            raise SkipTest(exc_str(exc))
        assert_in("WTF information of length", cmo.out)
        assert_not_in('user.name', cmo.out)
        if not pyperclip_works:
            # Some times does not throw but just fails to work
            raise SkipTest(
                "Pyperclip seems to be not functioning here correctly")
        assert_not_in('user.name', pyperclip.paste())
        assert_in(_HIDDEN, pyperclip.paste())  # by default no sensitive info
        assert_in("cmd:annex:", pyperclip.paste())  # but the content is there
Exemple #36
0
def _describe_datalad():

    return {
        'version': assure_unicode(__version__),
        'full_version': assure_unicode(__full_version__),
    }
Exemple #37
0
    def get_metadata(self, dataset, content):
        if not content:
            return {}, []
        context = {}
        contentmeta = []

        # which files to look for
        fname_match_regex = self.ds.config.get(
            'datalad.metadata.xmp.fname-match',
            '.*(jpg|jpeg|pdf|gif|tiff|tif|ps|eps|png|mp3|mp4|avi|wav)$')
        fname_match_regex = re.compile(fname_match_regex)

        log_progress(
            lgr.info,
            'extractorxmp',
            'Start XMP metadata extraction from %s',
            self.ds,
            total=len(self.paths),
            label='XMP metadata extraction',
            unit=' Files',
        )
        for f in self.paths:
            log_progress(lgr.info,
                         'extractorxmp',
                         'Extract XMP metadata from %s',
                         f,
                         update=1,
                         increment=True)
            # run basic file name filter for performance reasons
            # it is OK to let false-positives through
            if fname_match_regex.match(f, re.IGNORECASE) is None:
                continue
            absfp = opj(self.ds.path, f)
            info = file_to_dict(absfp)
            if not info:
                # got nothing, likely nothing there
                # TODO check if this is an XMP sidecar file, parse that, and assign metadata
                # to the base file
                continue
            # update vocabulary
            vocab = {
                info[ns][0][0].split(':')[0]: {
                    '@id': ns,
                    'type': vocabulary_id
                }
                for ns in info
            }
            # TODO this is dirty and assumed that XMP is internally consistent with the
            # definitions across all files -- which it likely isn't
            context.update(vocab)
            # now pull out actual metadata
            # cannot do simple dict comprehension, because we need to beautify things a little

            meta = {}
            for ns in info:
                for key, val, props in info[ns]:
                    if not val:
                        # skip everything empty
                        continue
                    if key.count('[') > 1:
                        # this is a nested array
                        # MIH: I do not think it is worth going here
                        continue
                    if props['VALUE_IS_ARRAY']:
                        # we'll catch the actuall array values later
                        continue
                    # normalize value
                    val = assure_unicode(val)
                    # non-breaking space
                    val = val.replace(u"\xa0", ' ')

                    field, idx, qual = xmp_field_re.match(key).groups()
                    normkey = u'{}{}'.format(field, qual)
                    if '/' in key:
                        normkey = u'{0}<{1}>'.format(*normkey.split('/'))
                    if idx:
                        # array
                        arr = meta.get(normkey, [])
                        arr.append(val)
                        meta[normkey] = arr
                    else:
                        meta[normkey] = val
            # compact
            meta = {
                k: v[0] if isinstance(v, list) and len(v) == 1 else v
                for k, v in meta.items()
            }

            contentmeta.append((f, meta))

        log_progress(lgr.info, 'extractorxmp',
                     'Finished XMP metadata extraction from %s', self.ds)
        return {
            '@context': context,
        }, \
            contentmeta
Exemple #38
0
    def __call__(message=None, path=None, dataset=None,
                 all_updated=True, version_tag=None,
                 recursive=False, recursion_limit=None, super_datasets=False,
                 message_file=None
                 ):
        if not dataset and not path:
            # we got nothing at all -> save what is staged in the repo in "this" directory?
            # make sure we don't treat this as a user-provided '.' argument
            path = [{'path': abspath(curdir), 'raw_input': False}]

        refds_path = Interface.get_refds_path(dataset)

        if message and message_file:
            raise ValueError("Both a message and message file were specified")

        if message_file:
            with open(message_file, "rb") as mfh:
                message = assure_unicode(mfh.read())

        to_process = []
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='save',
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                modified='HEAD' if not path and recursive else None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('state', None) == 'untracked' and not ap.get('raw_input', False):
                # this path was found untracked, but not explicitly given to save
                # we will silently ignore this
                continue
            got_nothing = False
            # next check should not be done during annotation, as it is possibly expensive
            # and not generally useful
            if ap.get('status', None) == 'impossible' and \
                    ap.get('state', None) == 'absent' and \
                    ap.get('parentds', None):
                # this is not here anymore, but it might actually have been a deleted
                # component
                if relpath(ap['path'], start=ap['parentds']) \
                        in Dataset(ap['parentds']).repo.get_deleted_files():
                    # ok, this is a staged deletion that we want to save
                    ap['status'] = ''
                    del ap['message']
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            # for things like: `ds.save()`
            # or recursively discovered datasets
            if ap['path'] == refds_path or \
                    (ap.get('type', None) == 'dataset' and
                     not ap.get('raw_input', False) and
                     not ap.get('state', None) == 'absent'):
                ap['process_content'] = True
                ap['process_updated_only'] = all_updated
            to_process.append(ap)
        lgr.log(2, "save, to_process=%r", to_process)
        if got_nothing and recursive and refds_path:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict(
                'save',
                status='notneeded',
                path=refds_path,
                type='dataset',
                logger=lgr)
            return

        if not to_process:
            # nothing left to do, potentially all errored before
            return

        if super_datasets:
            # search for the topmost superdatasets of any path
            dss = [Dataset(ap.get('parentds', ap['path'])) for ap in to_process]
            superdss = [ds.get_superdataset(topmost=True)
                        for ds in dss]
            superdss = get_tree_roots(
                unique(ds.path for ds in dss + superdss if ds))
            if dataset:
                # need to adjust the reference to the new superds
                # if we had one ref before, we should still have exactly one
                assert len(superdss) <= 1
                dataset = list(superdss.keys())[0]
                refds_path = dataset
        elif refds_path:
            # there is a single superdataset
            superdss = {
                refds_path: unique([ap['parentds']
                                    for ap in to_process if 'parentds' in ap])}
        else:
            # sort all datasets under their potential superdatasets
            # start from the top to get all subdatasets down the line
            # and collate them into as few superdatasets as possible
            # this is quick, just string operations
            superdss = get_tree_roots(
                unique([ap['parentds'] for ap in to_process if 'parentds' in ap]))
        # for each "superdataset" check the tree of subdatasets and make sure
        # we gather all datasets between the super and any subdataset
        # so we can save them all bottom-up in order to be able to properly
        # save the superdataset
        # if this is called from e.g. `add` this is actually not necessary,
        # but in the general case we cannot avoid it
        # TODO maybe introduce a switch?
        discovered = {}
        for superds_path in superdss:
            target_subs = superdss[superds_path]
            discover_dataset_trace_to_targets(
                # from here
                superds_path,
                # to all
                target_subs,
                [],
                discovered)
        # create a new minimally annotated path for each discovered dataset
        discovered_added = set()
        for parentds in discovered:
            for subds in discovered[parentds]:
                to_process.append(dict(
                    path=subds,
                    parentds=parentds,
                    type='dataset'))
                discovered_added.add(subds)
        # make sure we have an entry for each dataset, including those
        # tha are just parents
        for parentds in discovered:
            if parentds not in discovered_added:
                to_process.append(dict(
                    path=parentds,
                    type='dataset',
                    # make sure we save content of superds later on
                    process_content=True,
                    # but not do nasty things, like adding untracked content
                    # just because we discovered this dataset
                    process_updated_only=True))

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, deduplication happens here too
        annotated_paths = AnnotatePaths.__call__(
            path=to_process,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='save',
            unavailable_path_status='',
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        # now sort into datasets so we can process them one by one
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert(not completed)

        # iterate over all datasets, starting at the bottom
        for dspath in sorted(content_by_ds.keys(), reverse=True):
            ds = Dataset(dspath)
            res = get_status_dict('save', ds=ds, logger=lgr)
            if not ds.is_installed():
                # TODO This is likely impossible now
                res['status'] = 'impossible'
                res['message'] = ('dataset %s is not installed', ds)
                yield res
                continue
            saved_state = save_dataset(
                ds,
                content_by_ds[dspath],
                message=message)
            res['status'] = 'ok' if saved_state else 'notneeded'
            # MIH: let's tag even if there was nothing commit. I'd forget this
            # option too often...
            if version_tag:
                try:
                    # TODO: check whether comment below is still true after
                    # removing the log swallowing:
                    # again cannot help but force-silence low-level code, because
                    # it screams like a made man instead of allowing top-level
                    # code an orderly error report
                    ds.repo.tag(version_tag)
                    # even if we haven't saved anything
                    res['status'] = 'ok'
                    yield res
                except CommandError as e:
                    if saved_state:
                        # first we yield the result for the actual save
                        yield res
                    # and now complain that tagging didn't work
                    yield get_status_dict(
                        'save',
                        ds=ds,
                        logger=lgr,
                        status='error',
                        message=(
                            'cannot tag this version: %s',
                            e.stderr.strip()))
            else:
                yield res
Exemple #39
0
    def __call__(dataset=None,
                 sensitive=None,
                 sections=None,
                 decor=None,
                 clipboard=None):
        from datalad.distribution.dataset import require_dataset
        from datalad.support.exceptions import NoDatasetFound
        from datalad.interface.results import get_status_dict

        ds = None
        try:
            ds = require_dataset(dataset,
                                 check_installed=False,
                                 purpose='reporting')
        except NoDatasetFound:
            # failure is already logged
            pass
        if ds and not ds.is_installed():
            # warn that the dataset is bogus
            yield dict(
                action='wtf',
                path=ds.path,
                status='impossible',
                message=('No dataset found at %s. Reporting on the dataset is '
                         'not attempted.', ds.path),
                logger=lgr)
            # we don't deal with absent datasets
            ds = None
        if sensitive:
            if ds is None:
                from datalad import cfg
            else:
                cfg = ds.config
        else:
            cfg = None

        from datalad.ui import ui
        from datalad.support.external_versions import external_versions

        infos = OrderedDict()
        res = get_status_dict(
            action='wtf',
            path=ds.path if ds else assure_unicode(op.abspath(op.curdir)),
            type='dataset' if ds else 'directory',
            status='ok',
            logger=lgr,
            decor=decor,
            infos=infos,
        )

        # Define section callables which require variables.
        # so there is no side-effect on module level original
        section_callables = SECTION_CALLABLES.copy()
        section_callables['location'] = partial(_describe_location, res)
        section_callables['configuration'] = \
            partial(_describe_configuration, cfg, sensitive)
        if ds:
            section_callables['dataset'] = \
                partial(_describe_dataset, ds, sensitive)
        else:
            section_callables.pop('dataset')
        assert all(section_callables.values())  # check if none was missed

        if sections is None:
            sections = sorted(list(section_callables))

        for s in sections:
            infos[s] = section_callables[s]()

        if clipboard:
            external_versions.check(
                'pyperclip', msg="It is needed to be able to use clipboard")
            import pyperclip
            report = _render_report(res)
            pyperclip.copy(report)
            ui.message("WTF information of length %s copied to clipboard" %
                       len(report))
        yield res
        return
Exemple #40
0
    def show_keys(self, mode=None):
        maxl = 100  # maximal line length for unique values in mode=short

        # use a dict already, later we need to map to a definition
        # meanwhile map to the values

        class key_stat:
            def __init__(self):
                self.ndatasets = 0  # how many datasets have this field
                self.uvals = set()

        from collections import defaultdict
        keys = defaultdict(key_stat)

        for res in query_aggregated_metadata(
                # XXX TODO After #2156 datasets may not necessarily carry all
                # keys in the "unique" summary
                reporton='datasets',
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                recursive=True):
            meta = res.get('metadata', {})
            # inject a few basic properties into the dict
            # analog to what the other modes do in their index
            meta.update({
                k: res.get(k, None)
                for k in ('@id', 'type', 'path', 'parentds')
                # parentds is tricky all files will have it, but the dataset
                # queried above might not (single dataset), let's force it in
                if k == 'parentds' or k in res
            })

            # no stringification of values for speed
            idxd = _meta2autofield_dict(meta, val2str=False)

            for k, kvals in iteritems(idxd):
                # TODO deal with conflicting definitions when available
                keys[k].ndatasets += 1
                if mode == 'name':
                    continue
                try:
                    kvals_set = assure_iter(kvals, set)
                except TypeError:
                    # TODO: may be do show hashable ones???
                    nunhashable = sum(
                        isinstance(x, collections.Hashable) for x in kvals)
                    kvals_set = {
                        'unhashable %d out of %d entries' %
                        (nunhashable, len(kvals))
                    }
                keys[k].uvals |= kvals_set

        for k in sorted(keys):
            if mode == 'name':
                print(k)
                continue

            # do a bit more
            stat = keys[k]
            uvals = stat.uvals
            if mode == 'short':
                # show only up to X uvals
                if len(stat.uvals) > 10:
                    uvals = {v for i, v in enumerate(uvals) if i < 10}
            # all unicode still scares yoh -- he will just use repr
            # def conv(s):
            #     try:
            #         return '{}'.format(s)
            #     except UnicodeEncodeError:
            #         return assure_unicode(s).encode('utf-8')
            stat.uvals_str = assure_unicode("{} unique values: {}".format(
                len(stat.uvals), ', '.join(map(repr, uvals))))
            if mode == 'short':
                if len(stat.uvals) > 10:
                    stat.uvals_str += ', ...'
                if len(stat.uvals_str) > maxl:
                    stat.uvals_str = stat.uvals_str[:maxl - 4] + ' ....'
            elif mode == 'full':
                pass
            else:
                raise ValueError(
                    "Unknown value for stats. Know full and short")

            print('{k}\n in  {stat.ndatasets} datasets\n has {stat.uvals_str}'.
                  format(k=k, stat=stat))
Exemple #41
0
    def _mk_search_index(self, force_reindex):
        """Generic entrypoint to index generation

        The actual work that determines the structure and content of the index
        is done by functions that are passed in as arguments

        `meta2doc` - must return dict for index document from result input
        """
        from whoosh import index as widx
        from .metadata import agginfo_relpath
        # what is the lastest state of aggregated metadata
        metadata_state = self.ds.repo.get_last_commit_hash(agginfo_relpath)
        # use location common to all index types, they would all invalidate
        # simultaneously
        stamp_fname = opj(self.index_dir, 'datalad_metadata_state')
        index_dir = opj(self.index_dir, self._mode_label)

        if (not force_reindex) and \
                exists(index_dir) and \
                exists(stamp_fname) and \
                open(stamp_fname).read() == metadata_state:
            try:
                # TODO check that the index schema is the same
                # as the one we would have used for reindexing
                # TODO support incremental re-indexing, whoosh can do it
                idx = widx.open_dir(index_dir)
                lgr.debug('Search index contains %i documents',
                          idx.doc_count())
                self.idx_obj = idx
                return
            except widx.LockError as e:
                raise e
            except widx.IndexError as e:
                # Generic index error.
                # we try to regenerate
                lgr.warning(
                    "Cannot open existing index %s (%s), will regenerate",
                    index_dir, exc_str(e))
            except widx.IndexVersionError as e:  # (msg, version, release=None)
                # Raised when you try to open an index using a format that the
                # current version of Whoosh cannot read. That is, when the index
                # you're trying to open is either not backward or forward
                # compatible with this version of Whoosh.
                # we try to regenerate
                lgr.warning(exc_str(e))
                pass
            except widx.OutOfDateError as e:
                # Raised when you try to commit changes to an index which is not
                # the latest generation.
                # this should not happen here, but if it does ... KABOOM
                raise
            except widx.EmptyIndexError as e:
                # Raised when you try to work with an index that has no indexed
                # terms.
                # we can just continue with generating an index
                pass
            except ValueError as e:
                if 'unsupported pickle protocol' in str(e):
                    lgr.warning(
                        "Cannot open existing index %s (%s), will regenerate",
                        index_dir, exc_str(e))
                else:
                    raise

        lgr.info('{} search index'.format(
            'Rebuilding' if exists(index_dir) else 'Building'))

        if not exists(index_dir):
            os.makedirs(index_dir)

        # this is a pretty cheap call that just pull this info from a file
        dsinfo = self.ds.metadata(get_aggregates=True,
                                  return_type='list',
                                  result_renderer='disabled')

        self._mk_schema(dsinfo)

        idx_obj = widx.create_in(index_dir, self.schema)
        idx = idx_obj.writer(
            # cache size per process
            limitmb=cfg.obtain('datalad.search.indexercachesize'),
            # disable parallel indexing for now till #1927 is resolved
            ## number of processes for indexing
            #procs=multiprocessing.cpu_count(),
            ## write separate index segments in each process for speed
            ## asks for writer.commit(optimize=True)
            #multisegment=True,
        )

        # load metadata of the base dataset and what it knows about all its subdatasets
        # (recursively)
        old_idx_size = 0
        old_ds_rpath = ''
        idx_size = 0
        log_progress(
            lgr.info,
            'autofieldidxbuild',
            'Start building search index',
            total=len(dsinfo),
            label='Building search index',
            unit=' Datasets',
        )
        for res in query_aggregated_metadata(
                reporton=self.documenttype,
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                # MIH: I cannot see a case when we would not want recursion (within
                # the metadata)
                recursive=True):
            # this assumes that files are reported after each dataset report,
            # and after a subsequent dataset report no files for the previous
            # dataset will be reported again
            meta = res.get('metadata', {})
            doc = self._meta2doc(meta)
            admin = {
                'type': res['type'],
                'path': relpath(res['path'], start=self.ds.path),
            }
            if 'parentds' in res:
                admin['parentds'] = relpath(res['parentds'],
                                            start=self.ds.path)
            if admin['type'] == 'dataset':
                if old_ds_rpath:
                    lgr.debug(
                        'Added %s on dataset %s',
                        single_or_plural('document',
                                         'documents',
                                         idx_size - old_idx_size,
                                         include_count=True), old_ds_rpath)
                log_progress(lgr.info,
                             'autofieldidxbuild',
                             'Indexed dataset at %s',
                             old_ds_rpath,
                             update=1,
                             increment=True)
                old_idx_size = idx_size
                old_ds_rpath = admin['path']
                admin['id'] = res.get('dsid', None)

            doc.update({k: assure_unicode(v) for k, v in admin.items()})
            lgr.debug("Adding document to search index: {}".format(doc))
            # inject into index
            idx.add_document(**doc)
            idx_size += 1

        if old_ds_rpath:
            lgr.debug(
                'Added %s on dataset %s',
                single_or_plural('document',
                                 'documents',
                                 idx_size - old_idx_size,
                                 include_count=True), old_ds_rpath)

        lgr.debug("Committing index")
        idx.commit(optimize=True)
        log_progress(lgr.info, 'autofieldidxbuild',
                     'Done building search index')

        # "timestamp" the search index to allow for automatic invalidation
        with open(stamp_fname, 'w') as f:
            f.write(metadata_state)

        lgr.info('Search index contains %i documents', idx_size)
        self.idx_obj = idx_obj
Exemple #42
0
def _describe_datalad():

    return {
        'version': assure_unicode(__version__),
        'full_version': assure_unicode(__full_version__),
    }
Exemple #43
0
    def __call__(
            path=None,
            dataset=None,
            # support passing this through in a path by path basis
            to_git=None,
            save=True,
            message=None,
            message_file=None,
            recursive=False,
            recursion_limit=None,
            ds2super=False,
            git_opts=None,
            annex_opts=None,
            annex_add_opts=None,
            jobs=None):
        # parameter constraints:
        if not path:
            raise InsufficientArgumentsError(
                "insufficient information for adding: requires at least a path")
        refds_path = Interface.get_refds_path(dataset)
        common_report = dict(action='add', logger=lgr, refds=refds_path)

        if message and message_file:
            raise ValueError("Both a message and message file were specified")

        if message_file:
            with open(message_file, "rb") as mfh:
                message = assure_unicode(mfh.read())

        to_add = []
        subds_to_add = {}
        ds_to_annotate_from_recursion = {}
        got_nothing = True
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=dataset,
                # never recursion, need to handle manually below to be able to
                # discover untracked content
                recursive=False,
                action='add',
                # speed things up by using Git's modification detection, if there
                # is a repo with at least one commit
                modified='HEAD' \
                if dataset and \
                GitRepo.is_valid_repo(refds_path) and \
                GitRepo(refds_path).get_hexsha() \
                else None,
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist: %s",
                nondataset_path_status='impossible',
                return_type='generator',
                on_failure='ignore'):
            got_nothing = False
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset':
                yield get_status_dict(
                    status='impossible',
                    message='"there is no dataset to add this path to',
                    **dict(common_report, **ap))
                continue
            if ap.get('type', None) == 'directory' and \
                    ap.get('state', None) == 'untracked' and \
                    GitRepo.is_valid_repo(ap['path']):
                # this is an untracked wannabe subdataset in disguise
                ap['type'] = 'dataset'
            if recursive and \
                    (ap.get('raw_input', False) or
                     ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                    (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
                # this was an actually requested input path, or a path that was found
                # modified by path annotation, based on an input argument
                # we need to recurse into all subdirs to find potentially
                # unregistered subdatasets
                # but only if this path has a parent, or is itself a dataset
                # otherwise there is nothing to add to
                _discover_subdatasets_recursively(
                    ds_to_annotate_from_recursion,
                    ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']],
                    recursion_limit)
                # get the file content of the root dataset of this search added too
                # but be careful with extreme recursion_limit settings
                if recursion_limit is None or recursion_limit > 0:
                    ap['process_content'] = True
            # record for further processing
            if not ap['path'] in ds_to_annotate_from_recursion:
                # if it was somehow already discovered
                to_add.append(ap)
        if got_nothing:
            # path annotation yielded nothing, most likely cause is that nothing
            # was found modified, we need to say something about the reference
            # dataset
            yield get_status_dict(
                'add',
                status='notneeded',
                path=refds_path,
                type='dataset',
                logger=lgr)
            return

        for subds in ds_to_annotate_from_recursion:
            if subds not in subds_to_add:
                # always prefer the already annotated path
                subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

        if dataset:
            # we have a base dataset, discover any intermediate datasets between
            # the base and any already discovered dataset
            discovered = {}
            discover_dataset_trace_to_targets(
                # from here
                dataset.path,
                # to any dataset we are aware of
                subds_to_add.keys(),
                [],
                discovered)
            for parentds in discovered:
                for subds in discovered[parentds]:
                    subds_to_add[subds] = subds_to_add.get(
                        subds,
                        dict(path=subds, parentds=parentds, type='dataset'))

        # merge custom paths and discovered dataset records, paths needs to go first,
        # because we know most about then, and subsequent annotation call we skip the
        # later duplicate ones
        to_add.extend(subds_to_add.values())
        # and compact, this should be OK as all the info is in each ap dict
        to_add = unique(to_add, lambda x: x['path'])

        if not to_add:
            # nothing left to do, potentially all errored before
            return

        # now re-annotate all paths, this will be fast for already annotated ones
        # and will amend the annotation for others, it will also deduplicate
        annotated_paths = AnnotatePaths.__call__(
            path=to_add,
            dataset=dataset,
            # never recursion, done already
            recursive=False,
            action='add',
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            # if there is an error now, we made this mistake in here
            on_failure='stop')

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                annotated_paths,
                refds_path=refds_path)
        assert(not completed)

        if not content_by_ds:
            # we should have complained about any inappropriate path argument
            # above, so if nothing is left, we can simply exit
            return

        # simple loop over datasets -- save happens later
        # start deep down
        to_save = []
        for ds_path in sorted(content_by_ds, reverse=True):
            ds = Dataset(ds_path)
            torepoadd = {}
            respath_by_status = {}
            for ap in content_by_ds[ds_path]:
                # we have a new story
                ap.pop('status', None)
                torepoadd[ap['path']] = ap

                # skip anything that doesn't look like a wannabe subdataset
                if not ap.get('type', None) == 'dataset' or \
                        ap['path'] == ds_path:
                    continue

                if ap.get('registered_subds', False):
                    # subdataset that might be in this list because of the
                    # need to save all the way up to a super dataset
                    respath_by_status['success'] = \
                        respath_by_status.get('success', []) + [ap['path']]
                    yield get_status_dict(
                        status='notneeded',
                        message="already known subdataset",
                        **dict(common_report, **ap))
                    continue
                subds = Dataset(ap['path'])
                subds_relpath = relpath(ap['path'], ds_path)
                # Register the repository in the repo tree as a submodule
                try:
                    ds.repo.add_submodule(subds_relpath, url=None, name=None)
                except (CommandError, InvalidGitRepositoryError) as e:
                    yield get_status_dict(
                        ds=subds, status='error', message=e.stderr,
                        **dict(common_report, **ap))
                    continue
                # queue for saving using the updated annotated path
                ap['registered_subds'] = True
                # I hope this is true in direct mode too
                # TODO this is disabled, because in some circumstances
                # staging just doesn't happen, and it is unclear when
                # exactly -- the case that prompted disabling was a submodule
                # that had no content except for other submodules was not staged,
                # whereas another submodule on the same level in the same
                # superdataset which also has one file in it was staged
                # disable to work correctly, while paying a little bit of
                # slow down
                #ap['staged'] = True
                to_save.append(ap)
                # report added subdatasets -- `annex add` below won't do it
                yield get_status_dict(
                    ds=subds,
                    status='ok',
                    message='added new subdataset',
                    **dict(common_report, **ap))
                # make sure that .gitmodules is added to the list of files
                gitmodules_path = opj(ds.path, '.gitmodules')
                # for git
                torepoadd[gitmodules_path] = dict(path=gitmodules_path)
                # and for save
                to_save.append(dict(
                    path=gitmodules_path,
                    parentds=ds_path,
                    type='file'))
            # make sure any last minute additions make it to the saving stage
            # XXX? should content_by_ds become OrderedDict so that possible
            # super here gets processed last?
            lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
            is_annex = isinstance(ds.repo, AnnexRepo)
            add_kw = {'jobs': jobs} if is_annex and jobs else {}
            added = ds.repo.add_(
                list(torepoadd.keys()),
                git=to_git if is_annex else True,
                **add_kw
            )
            for a in added:
                res = annexjson2result(a, ds, type='file', **common_report)
                success = success_status_map[res['status']]
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
                # produce best possible path/result annotation
                if res['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    res = dict(torepoadd[res['path']], **res)
                # override this in all cases to be safe
                res['parentds'] = ds.path
                if success:
                    # this was successfully added, queue for saving this very path
                    # in the dataset
                    ap = {k: v for k, v in res.items() if k != 'status'}
                    ap['staged'] = True
                    # strip any status and state info (e.g. save will refuse to save
                    # stuff that is marked state='untracked'
                    to_save.append({k: v for k, v in res.items()
                                    if k not in ('status', 'state')})
                if a['file'] == '.gitmodules':
                    # filter out .gitmodules, because this is only included for
                    # technical reasons and has nothing to do with the actual content
                    continue
                if GitRepo.is_valid_repo(res['path']):
                    # more accurate report in case of an added submodule
                    # mountpoint.
                    # XXX Actually not sure if this can really happen
                    # (depends on what our low-level code would do)
                    # but worst case is that we loose a little bit of
                    # coverage...
                    res['type'] = 'dataset'
                    res['message'] = 'added new state as submodule'
                yield res

            for r in results_from_annex_noinfo(
                    ds, torepoadd, respath_by_status,
                    dir_fail_msg='could not add some content in %s %s',
                    noinfo_dir_msg='nothing to add from %s',
                    noinfo_file_msg='already included in the dataset',
                    action='add',
                    logger=lgr,
                    refds=refds_path):
                if r['path'] in torepoadd:
                    # pull out correct ap for any path that comes out here
                    # (that we know things about), and use the original annotation
                    # instead of just the annex report
                    r = dict(r, **torepoadd[r['path']])

                if r['status'] == 'notneeded':
                    # this could be a file that was staged already, it doesn't need
                    # to be added, but it should be saved/commited if so desired
                    to_save.append({k: v for k, v in r.items()
                                    if k not in ('status', 'state')})

                # XXX something is fishy with the next one, rethink when sober....
                if r['path'] == ds_path and r['status'] == 'ok':
                    # this is for the entire dataset itself which was explicitly requested
                    # make sure to save all
                    r['type'] = 'dataset'
                    r['process_content'] = True
                    to_save.append({k: v for k, v in r.items() if k != 'status'})
                yield r
            if refds_path and ds_path != refds_path and len(respath_by_status.get('success', [])):
                # TODO XXX we have an issue here when with `add('.')` and annex ignores any
                # dotfiles. In this case we end up not saving a dataset completely, because
                # we rely on accurate reporting. there is an issue about this already
                # TODO look up the issue ID
                # if there is a base dataset, but we are below it, and we have anything done to this
                # dataset -> queue dataset itself for saving its state in the parent
                ds_ap = dict(
                    path=ds.path,
                    # we have to look for the parent here, as we must save the
                    # subdataset in the parent and not the whole subdataset itself
                    type='dataset')
                parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
                if parentds:
                    ds_ap['parentds'] = parentds
                if dataset:
                    ds_ap['refds'] = refds_path
                to_save.append(ds_ap)

        if not save:
            lgr.debug('Not calling `save` as instructed')
            return

        # TODO tell save what was staged already! Set 'staged=True' for
        # respective annotated paths that are fed into `save`

        # do not reuse any of the sorting done in here for saving, but instead
        # pass on all the annotated paths to have `save` figure out what to do with
        # them -- this is costs something, but should be safer, and frankly is
        # more comprehensible
        for res in Save.__call__(
                # hand-selected annotated paths
                path=to_save,
                dataset=refds_path,
                message=message if message else '[DATALAD] added content',
                return_type='generator',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
Exemple #44
0
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything aggregated we would get nothing and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
    assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                        where='dataset')
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    ok_clean_git(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also report layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # Now that we have annex.key
    # three different IDs
    assert_equal(3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] \
                    == assure_unicode(name) for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the extact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok', clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])