def _describe_datalad():
    from datalad.version import __version__, __full_version__

    return {
        'version': assure_unicode(__version__),
        'full_version': assure_unicode(__full_version__),
    }
def yield_participant_info(bids):
    for bidsvars in bids.get_collections(
            level='dataset')[0].to_df().to_dict(orient='records'):
        props = dict(id=assure_unicode(bidsvars.pop('subject')))
        for p in bidsvars:
            # take away some ambiguity
            normk = assure_unicode(p).lower()
            hk = content_metakey_map.get(normk, normk)
            val = assure_unicode(bidsvars[p])
            if hk in ('sex', 'gender'):
                if hasattr(val, 'lower'):
                    val = val.lower()
                elif isinstance(val, float) and isnan(val):
                    # pybids reports 'n/a' as NaN
                    val = 'n/a'
                val = sex_label_map.get(val, val)
            if hk == 'suffix' and val == 'participants':
                # regression in PyBIDS 0.7.1, should be fixed in 0.8
                # https://github.com/bids-standard/pybids/issues/380
                # TODO: remove workaround whenever we depend on pybids >= 0.8
                # after verifying that it is not susceptible
                continue
            if val:
                props[hk] = val
        if props:
            yield re.compile(r'^sub-{}/.*'.format(props['id'])), props
def _add_document(idx, **kwargs):
    idx.add_document(
        **{
            assure_unicode(k):
            assure_unicode(v) if isinstance(v, unicode_srctypes) else v
            for k, v in kwargs.items()
        })
def __call__(self, query, max_nresults=None, force_reindex=False):
    with self.idx_obj.searcher() as searcher:
        wquery = self.get_query(query)

        # perform the actual search
        hits = searcher.search(
            wquery,
            terms=True,
            limit=max_nresults if max_nresults > 0 else None)
        # report query stats
        topstr = '{} top {}'.format(
            max_nresults,
            single_or_plural('match', 'matches', max_nresults))
        lgr.info('Query completed in {} sec.{}'.format(
            hits.runtime,
            ' Reporting {}.'.format(
                ('up to ' + topstr)
                if max_nresults > 0
                else 'all matches')
            if not hits.is_empty()
            else ' No matches.'))

        if not hits:
            return

        nhits = 0
        # annotate hits for full metadata report
        hits = [dict(
            path=normpath(opj(self.ds.path, hit['path'])),
            query_matched={assure_unicode(k):
                           assure_unicode(v)
                           if isinstance(v, unicode_srctypes) else v
                           for k, v in hit.matched_terms()},
            parentds=normpath(
                opj(self.ds.path, hit['parentds'])) if 'parentds' in hit else None,
            type=hit.get('type', None))
            for hit in hits]

        for res in query_aggregated_metadata(
                # type is taken from hit record
                reporton=None,
                ds=self.ds,
                aps=hits,
                # never recursive, we have direct hits already
                recursive=False):
            res.update(
                refds=self.ds.path,
                action='search',
                status='ok',
                logger=lgr,
            )
            yield res
            nhits += 1

        if max_nresults and nhits == max_nresults:
            lgr.info(
                "Reached the limit of {}, there could be more which "
                "were not reported.".format(topstr))
def custom_result_renderer(res, **kwargs):  # pragma: no cover
    if not (res['status'] == 'ok' \
            and res['action'] in ('status', 'diff') \
            and res.get('state', None) != 'clean'):
        # logging reported already
        return
    from datalad.ui import ui
    # when to render relative paths:
    #  1) if a dataset arg was given
    #  2) if CWD is the refds
    refds = res.get('refds', None)
    refds = refds if kwargs.get('dataset', None) is not None \
        or refds == os.getcwd() else None
    # Note: We have to force unicode for res['path'] because
    # interface.utils encodes it on py2 before passing it to
    # custom_result_renderer().
    path = assure_unicode(res['path']) if refds is None \
        else text_type(ut.Path(res['path']).relative_to(refds))
    type_ = res.get('type', res.get('type_src', ''))
    max_len = len('untracked')
    state = res.get('state', 'unknown')
    ui.message(u'{fill}{state}: {path}{type_}'.format(
        fill=' ' * max(0, max_len - len(state)),
        state=ac.color_word(
            state,
            STATE_COLOR_MAP.get(res.get('state', 'unknown'))),
        path=path,
        type_=' ({})'.format(
            ac.color_word(type_, ac.MAGENTA) if type_ else '')))
def normalize_command(command):
    """Convert `command` to the string representation.
    """
    if isinstance(command, list):
        command = list(map(assure_unicode, command))
        if len(command) == 1 and command[0] != "--":
            # This is either a quoted compound shell command or a simple
            # one-item command. Pass it as is.
            #
            # FIXME: This covers the predominant command-line case, but, for
            # Python API callers, it means values like ["./script with spaces"]
            # requires additional string-like escaping, which is inconsistent
            # with the handling of multi-item lists (and subprocess's
            # handling). Once we have a way to detect "running from Python API"
            # (discussed in gh-2986), update this.
            command = command[0]
        else:
            if command and command[0] == "--":
                # Strip disambiguation marker. Note: "running from Python API"
                # FIXME from below applies to this too.
                command = command[1:]
            command = " ".join(shlex_quote(c) for c in command)
    else:
        command = assure_unicode(command)
    return command
def normalize_command(command):
    """Convert `command` to the string representation.
    """
    if isinstance(command, list):
        command = list(map(assure_unicode, command))
        if len(command) == 1 and command[0] != "--":
            # This is either a quoted compound shell command or a simple
            # one-item command. Pass it as is.
            #
            # FIXME: This covers the predominant command-line case, but, for
            # Python API callers, it means values like ["./script with spaces"]
            # requires additional string-like escaping, which is inconsistent
            # with the handling of multi-item lists (and subprocess's
            # handling). Once we have a way to detect "running from Python API"
            # (discussed in gh-2986), update this.
            command = command[0]
        else:
            if command and command[0] == "--":
                # Strip disambiguation marker. Note: "running from Python API"
                # FIXME from below applies to this too.
                command = command[1:]
            command = join_cmdline(command)
    else:
        command = assure_unicode(command)
    return command
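For reference, a minimal usage sketch of `normalize_command` with hypothetical inputs; the expected strings assume POSIX-style quoting as produced by `shlex_quote`/`join_cmdline`, and are not taken from the DataLad test suite.

# Hypothetical illustration, not part of the DataLad sources:
# strings pass through unchanged, one-item lists are unwrapped,
# multi-item lists are quoted and joined, and a leading "--"
# disambiguation marker is stripped (assumes POSIX-style quoting).
assert normalize_command("echo 'hello world'") == "echo 'hello world'"
assert normalize_command(["./script with spaces"]) == "./script with spaces"
assert normalize_command(["echo", "hello world"]) == "echo 'hello world'"
assert normalize_command(["--", "echo", "hi"]) == "echo hi"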
def test_aggregate_with_missing_or_duplicate_id(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True, if_dirty='ignore')
    subds.repo.remove(opj('.datalad', 'config'))
    subds.save()
    assert_false(exists(opj(subds.path, '.datalad', 'config')))
    subsubds = subds.create('subsub', force=True, if_dirty='ignore')
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 metadata sets, one implicit, and one native per dataset
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    # and we know nothing about subsub
    for name in ('grandchild_äöü東',):
        assert_true(sum([s.get('name', '') == assure_unicode(name)
                         for s in meta]))
    # but search should not fail
    with swallow_outputs():
        res1 = list(search_('.', regex=True, dataset=ds))
    assert res1

    # and let's see now that we don't fail if a dataset is duplicated, i.e.
    # if we install the same dataset twice
    subds_clone = ds.install(source=subds.path, path="subds2")
    with swallow_outputs():
        res2 = list(search_('.', regex=True, dataset=ds))
def test_aggregate_with_missing_or_duplicate_id(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subds.remove(opj('.datalad', 'config'), if_dirty='ignore')
    assert_false(exists(opj(subds.path, '.datalad', 'config')))
    subsubds = subds.create('subsub', force=True)
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 metadata sets, one implicit, and one native per dataset
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from the cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    # and we know nothing about subsub
    for name in ('grandchild_äöü東',):
        assert_true(sum([s.get('name', '') == assure_unicode(name)
                         for s in meta]))
    # but search should not fail
    with swallow_outputs():
        res1 = list(search_('.', regex=True, dataset=ds))
    assert res1

    # and let's see now that we don't fail if a dataset is duplicated, i.e.
    # if we install the same dataset twice
    subds_clone = ds.install(source=subds.path, path="subds2")
    with swallow_outputs():
        res2 = list(search_('.', regex=True, dataset=ds))
def _get_dsmeta(self, bids):
    context = {}
    meta = {self._key2stdkey.get(k, k): v
            for k, v in bids.get_metadata(
                opj(self.ds.path, self._dsdescr_fname)).items()}
    # TODO maybe normalize labels of standard licenses to definition URIs
    # perform mapping
    README_fname = opj(self.ds.path, 'README')
    if not meta.get('description') and exists(README_fname):
        # BIDS uses README to provide a description, so if one was not
        # explicitly provided (possibly to override a longer README),
        # let's just load the README
        with open(README_fname, 'rb') as f:
            desc = assure_unicode(f.read())
        meta['description'] = desc.strip()
    # special case
    # Could be None, which we can't strip, or ''
    bids_version = (meta.get('BIDSVersion', '') or '').strip()
    bids_defurl = 'http://bids.neuroimaging.io'
    if bids_version:
        bids_defurl += '/bids_spec{}.pdf'.format(bids_version)
    meta['conformsto'] = bids_defurl
    context['bids'] = {
        # not really a working URL, but BIDS doesn't provide term defs in
        # any accessible way
        '@id': '{}#'.format(bids_defurl),
        'description': 'ad-hoc vocabulary for the Brain Imaging Data Structure (BIDS) standard',
        'type': vocabulary_id,
    }
    context.update(vocabulary)
    meta['@context'] = context
    return meta
def __call__(path=None,
             fr='HEAD',
             to=None,
             dataset=None,
             annex=None,
             untracked='normal',
             recursive=False,
             recursion_limit=None):
    yield from diff_dataset(
        dataset=dataset,
        fr=assure_unicode(fr),
        to=assure_unicode(to),
        constant_refs=False,
        path=path,
        annex=annex,
        untracked=untracked,
        recursive=recursive,
        recursion_limit=recursion_limit)
def get_metadata(self, dataset, content):
    if not content:
        return {}, []
    context = {}
    contentmeta = []
    for f in self.paths:
        info = file_to_dict(opj(self.ds.path, f))
        if not info:
            # got nothing, likely nothing there
            # TODO check if this is an XMP sidecar file, parse that, and
            # assign metadata to the base file
            continue
        # update vocabulary
        vocab = {info[ns][0][0].split(':')[0]: {'@id': ns, 'type': vocabulary_id}
                 for ns in info}
        # TODO this is dirty and assumes that XMP is internally consistent
        # with the definitions across all files -- which it likely isn't
        context.update(vocab)
        # now pull out actual metadata
        # cannot do simple dict comprehension, because we need to beautify
        # things a little
        meta = {}
        for ns in info:
            for key, val, props in info[ns]:
                if not val:
                    # skip everything empty
                    continue
                if key.count('[') > 1:
                    # this is a nested array
                    # MIH: I do not think it is worth going here
                    continue
                if props['VALUE_IS_ARRAY']:
                    # we'll catch the actual array values later
                    continue
                # normalize value
                val = assure_unicode(val)
                # non-breaking space
                val = val.replace(u"\xa0", ' ')
                field, idx, qual = xmp_field_re.match(key).groups()
                normkey = u'{}{}'.format(field, qual)
                if '/' in key:
                    normkey = u'{0}<{1}>'.format(*normkey.split('/'))
                if idx:
                    # array
                    arr = meta.get(normkey, [])
                    arr.append(val)
                    meta[normkey] = arr
                else:
                    meta[normkey] = val
        # compact
        meta = {k: v[0] if isinstance(v, list) and len(v) == 1 else v
                for k, v in meta.items()}
        contentmeta.append((f, meta))

    return {
        '@context': context,
    }, \
        contentmeta
def __str__(self):
    from datalad.utils import assure_unicode
    to_str = "%s: " % self.__class__.__name__
    if self.cmd:
        to_str += "command '%s'" % (self.cmd,)
    if self.code:
        to_str += " failed with exitcode %d" % self.code
    to_str += "\n%s" % assure_unicode(self.msg)
    return to_str
def test__version__():
    # in released stage, version in the last CHANGELOG entry
    # should correspond to the one in datalad
    CHANGELOG_filename = op.join(
        op.dirname(__file__), op.pardir, op.pardir, 'CHANGELOG.md')
    if not op.exists(CHANGELOG_filename):
        raise SkipTest("no %s found" % CHANGELOG_filename)
    regex = re.compile(r'^## '
                       r'(?P<version>[0-9]+\.[0-9.abcrc~]+)\s+'
                       r'\((?P<date>.*)\)'
                       r'\s+--\s+'
                       r'(?P<codename>.+)')
    with open(CHANGELOG_filename, 'rb') as f:
        for line in f:
            line = line.rstrip()
            if not line.startswith(b'## '):
                # The first section header we hit, must be our changelog entry
                continue
            reg = regex.match(assure_unicode(line))
            if not reg:  # first one at that level is the one
                raise AssertionError(
                    "Following line must have matched our regex: %r" % line)
            regd = reg.groupdict()
            changelog_version = regd['version']
            lv_changelog_version = LooseVersion(changelog_version)
            # we might have a suffix - sanitize
            san__version__ = __version__.rstrip('.devdirty')
            lv__version__ = LooseVersion(san__version__)
            if '???' in regd['date'] and 'will be better than ever' in regd['codename']:
                # we only have our template
                # we can only assert that its version should be higher than
                # the one we have now
                assert_greater(lv_changelog_version, lv__version__)
            else:
                # should be a "release" record
                assert_not_in('???', regd['date'])
                assert_not_in('will be better than ever', regd['codename'])
                assert_equal(__hardcoded_version__, changelog_version)
                if __hardcoded_version__ != san__version__:
                    # It was not tagged yet and Changelog should have its
                    # template record for the next release
                    assert_greater(lv_changelog_version, lv__version__)
                    assert_in('.dev', san__version__)
                else:
                    # all is good, tagged etc
                    assert_equal(lv_changelog_version, lv__version__)
                    assert_equal(changelog_version, san__version__)
                    assert_equal(__hardcoded_version__, san__version__)
            return
    raise AssertionError(
        "No log line matching our regex found in %s" % CHANGELOG_filename)
def _get_untracked_content(dspath, report_untracked, paths=None):
    cmd = [
        'git', '--work-tree=.', 'status', '--porcelain',
        # file names NULL terminated
        '-z',
        # we never want to touch submodules, they cannot be untracked
        '--ignore-submodules=all',
        # fully untracked dirs as such, the rest as files
        '--untracked={}'.format(report_untracked)
    ]
    try:
        stdout, stderr = GitRunner(cwd=dspath).run(
            cmd,
            log_stderr=True,
            log_stdout=True,
            log_online=False,
            expect_stderr=False,
            shell=False,
            expect_fail=True)
    except CommandError as e:
        # TODO should we catch any and handle them in here?
        raise e

    if paths:
        paths = [r['path'] for r in paths]
        if len(paths) == 1 and paths[0] == dspath:
            # nothing to filter
            paths = None

    from datalad.utils import assure_unicode

    for line in stdout.split('\0'):
        if not line:
            continue
        line = assure_unicode(line)
        if not line.startswith('?? '):
            # nothing untracked, ignore, task of `diff`
            continue
        apath = opj(
            dspath,
            # strip state marker
            line[3:])
        norm_apath = normpath(apath)
        if paths and not any(norm_apath == p or path_startswith(apath, p)
                             for p in paths):
            # we got a whitelist for paths, don't report any other
            continue
        ap = dict(
            path=norm_apath,
            parentds=dspath,
            state='untracked',
            type='directory' if isdir(apath) else 'file')
        yield ap
def show_keys(self, mode=None):
    maxl = 100  # maximal line length for unique values in mode=short
    # use a dict already, later we need to map to a definition
    # meanwhile map to the values
    keys = self._get_keys(mode)

    for k in sorted(keys):
        if mode == 'name':
            print(k)
            continue

        # do a bit more
        stat = keys[k]
        uvals = stat.uvals
        if mode == 'short':
            # show only up to X uvals
            if len(stat.uvals) > 10:
                uvals = {v for i, v in enumerate(uvals) if i < 10}
        # all unicode still scares yoh -- he will just use repr
        # def conv(s):
        #     try:
        #         return '{}'.format(s)
        #     except UnicodeEncodeError:
        #         return assure_unicode(s).encode('utf-8')
        stat.uvals_str = assure_unicode(
            "{} unique values: {}".format(
                len(stat.uvals), ', '.join(map(repr, uvals))))
        if mode == 'short':
            if len(stat.uvals) > 10:
                stat.uvals_str += ', ...'
            if len(stat.uvals_str) > maxl:
                stat.uvals_str = stat.uvals_str[:maxl-4] + ' ....'
        elif mode == 'full':
            pass
        else:
            raise ValueError(
                "Unknown value for stats. Know full and short")

        print(
            u'{k}\n in {stat.ndatasets} datasets\n has {stat.uvals_str}'.format(
                k=k, stat=stat))

    # After #2156 datasets may not necessarily carry all
    # keys in the "unique" summary
    lgr.warning('In this search mode, the reported list of metadata keys may be incomplete')
def test_get_open_files(p):
    pobj = Path(p)
    skip_if_no_module('psutil')
    eq_(get_open_files(p), {})
    f1 = pobj / '1'
    subd = pobj / 'd'
    with f1.open() as f:
        # since lsof does not care about PWD env var etc, paths
        # will not contain symlinks, we better realpath them
        # all before comparison
        eq_(get_open_files(p, log_open=40)[str(f1.resolve())].pid,
            os.getpid())

    assert not get_open_files(str(subd))

    if on_windows:
        # the remainder of the test assumes a certain performance.
        # however, on windows get_open_files() can be very slow
        # (e.g. the first invocation in this test (above) can easily
        # take 30-50s). It is not worth slowing the tests to
        # accommodate this issue, given we have tested proper functioning
        # in principle already above.
        return

    # if we start a process within that directory, should get informed
    from subprocess import Popen, PIPE
    from time import time
    t0 = time()
    proc = Popen(
        [sys.executable, '-c',
         r'import sys; sys.stdout.write("OK\n"); sys.stdout.flush();'
         r'import time; time.sleep(10)'],
        stdout=PIPE,
        cwd=str(subd))
    # Assure that it started and we read the OK
    eq_(assure_unicode(proc.stdout.readline().strip()), u"OK")
    assert time() - t0 < 5  # that we were not stuck waiting for process to finish
    eq_(get_open_files(p)[str(subd.resolve())].pid, proc.pid)
    eq_(get_open_files(subd)[str(subd.resolve())].pid, proc.pid)
    proc.terminate()
    assert_equal(get_open_files(str(subd)), {})
def test_inputs_quotes_needed(path):
    ds = Dataset(path).create(force=True)
    ds.add(".")
    cmd = "import sys; open(sys.argv[-1], 'w').write('!'.join(sys.argv[1:]))"
    # The string form of a command works fine when the inputs/outputs have
    # spaces ...
    cmd_str = "{} -c \"{}\" {{inputs}} {{outputs[0]}}".format(
        sys.executable, cmd)
    ds.run(cmd_str, inputs=["*.t*"], outputs=["out0"], expand="inputs")
    expected = u"!".join(
        list(sorted([OBSCURE_FILENAME + u".t", "bar.txt", "foo blah.txt"])) +
        ["out0"])
    with open(op.join(path, "out0")) as ifh:
        eq_(assure_unicode(ifh.read()), expected)
    # ... but the list form of a command does not. (Don't test this failure
    # with the obscure file name because we'd need to know its composition to
    # predict the failure.)
    cmd_list = [sys.executable, "-c", cmd, "{inputs}", "{outputs[0]}"]
    ds.run(cmd_list, inputs=["*.txt"], outputs=["out0"])
    ok_file_has_content(opj(path, "out0"), "bar.txt foo!blah.txt!out0")
def test_inputs_quotes_needed(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    cmd = "import sys; open(sys.argv[-1], 'w').write('!'.join(sys.argv[1:]))"
    # The string form of a command works fine when the inputs/outputs have
    # spaces ...
    cmd_str = "{} -c \"{}\" {{inputs}} {{outputs[0]}}".format(
        sys.executable, cmd)
    ds.run(cmd_str, inputs=["*.t*"], outputs=["out0"], expand="inputs")
    expected = u"!".join(
        list(sorted([OBSCURE_FILENAME + u".t", "bar.txt", "foo blah.txt"])) +
        ["out0"])
    with open(op.join(path, "out0")) as ifh:
        eq_(assure_unicode(ifh.read()), expected)
    # ... but the list form of a command does not. (Don't test this failure
    # with the obscure file name because we'd need to know its composition to
    # predict the failure.)
    cmd_list = [sys.executable, "-c", cmd, "{inputs}", "{outputs[0]}"]
    ds.run(cmd_list, inputs=["*.txt"], outputs=["out0"])
    ok_file_has_content(op.join(path, "out0"), "bar.txt foo!blah.txt!out0")
def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace, spec, includeds=None): """Discover the edges and nodes in a dataset tree to given target paths Parameters ---------- basepath : path Path to a start or top-level dataset. Really has to be a path to a dataset! targetpaths : list(path) Any non-zero number of paths that are termination points for the search algorithm. Can be paths to datasets, directories, or files (and any combination thereof). current_trace : list For a top-level call this should probably always be `[]` spec : dict `content_by_ds`-style dictionary that will receive information about the discovered datasets. Specifically, for each discovered dataset there will be in item with its path under the key (path) of the respective superdataset. includeds : sequence, optional Any paths given are treated as existing subdatasets, regardless of whether they can be found in the filesystem. Such subdatasets will appear under the key of the closest existing dataset in the `spec`. Returns ------- None Function calls itself recursively and populates `spec` dict in-place. Keys are dataset paths, values are sets of subdataset paths """ # convert to set for faster lookup includeds = includeds if isinstance(includeds, set) else \ set() if includeds is None else set(includeds) # this beast walks the directory tree from a given `basepath` until # it discovers any of the given `targetpaths` # if it finds one, it commits any accummulated trace of visited # datasets on this edge to the spec valid_repo = GitRepo.is_valid_repo(basepath) if valid_repo: # we are passing into a new dataset, extend the dataset trace current_trace = current_trace + [basepath] # this edge is not done, we need to try to reach any downstream # dataset undiscovered_ds = set(t for t in targetpaths) # if t != basepath) # whether anything in this directory matched a targetpath filematch = False if isdir(basepath): for p in listdir(basepath): p = assure_unicode(opj(basepath, p)) if not isdir(p): if p in targetpaths: filematch = True # we cannot have anything below this one continue # OPT listdir might be large and we could have only few items # in `targetpaths` -- so traverse only those in spec which have # leading dir basepath # filter targets matching this downward path downward_targets = set( t for t in targetpaths if path_startswith(t, p)) if not downward_targets: continue # remove the matching ones from the "todo" list undiscovered_ds.difference_update(downward_targets) # go one deeper discover_dataset_trace_to_targets( p, downward_targets, current_trace, spec, includeds=includeds if not includeds else includeds.intersection( downward_targets)) undiscovered_ds = [t for t in undiscovered_ds if includeds and path_is_subpath(t, current_trace[-1]) and t in includeds] if filematch or basepath in targetpaths or undiscovered_ds: for i, p in enumerate(current_trace[:-1]): # TODO RF prepare proper annotated path dicts subds = spec.get(p, set()) subds.add(current_trace[i + 1]) spec[p] = subds if undiscovered_ds: spec[current_trace[-1]] = spec.get(current_trace[-1], set()).union( undiscovered_ds)
def test_aggregation(path): with chpwd(path): assert_raises(InsufficientArgumentsError, aggregate_metadata, None) # a hierarchy of three (super/sub)datasets, each with some native metadata ds = Dataset(opj(path, 'origin')).create(force=True) # before anything aggregated we would get nothing and only a log warning with swallow_logs(new_level=logging.WARNING) as cml: assert_equal(list(query_aggregated_metadata('all', ds, [])), []) assert_re_in('.*Found no aggregated metadata.*update', cml.out) ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') subds = ds.create('sub', force=True) subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') subsubds = subds.create('subsub', force=True) subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') ds.save(recursive=True) assert_repo_status(ds.path) # aggregate metadata from all subdatasets into any superdataset, including # intermediate ones res = ds.aggregate_metadata(recursive=True, update_mode='all') # we get success report for both subdatasets and the superdataset, # and they get saved assert_result_count(res, 6) assert_result_count(res, 3, status='ok', action='aggregate_metadata') assert_result_count(res, 3, status='ok', action='save') # nice and tidy assert_repo_status(ds.path) # quick test of aggregate report aggs = ds.metadata(get_aggregates=True) # one for each dataset assert_result_count(aggs, 3) # mother also report layout version assert_result_count(aggs, 1, path=ds.path, layout_version=1) # store clean direct result origres = ds.metadata(recursive=True) # basic sanity check assert_result_count(origres, 6) assert_result_count(origres, 3, type='dataset') assert_result_count(origres, 3, type='file') # Now that we have annex.key # three different IDs assert_equal( 3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset']))) # and we know about all three datasets for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'): assert_true( sum([s['metadata']['frictionless_datapackage']['name'] \ == assure_unicode(name) for s in origres if s['type'] == 'dataset'])) # now clone the beast to simulate a new user installing an empty dataset clone = install(opj(path, 'clone'), source=ds.path, result_xfm='datasets', return_type='item-or-list') # ID mechanism works assert_equal(ds.id, clone.id) # get fresh metadata cloneres = clone.metadata() # basic sanity check assert_result_count(cloneres, 2) assert_result_count(cloneres, 1, type='dataset') assert_result_count(cloneres, 1, type='file') # now loop over the previous results from the direct metadata query of # origin and make sure we get the extact same stuff from the clone _compare_metadata_helper(origres, clone) # now obtain a subdataset in the clone, should make no difference assert_status('ok', clone.install('sub', result_xfm=None, return_type='list')) _compare_metadata_helper(origres, clone) # test search in search tests, not all over the place ## query smoke test assert_result_count(clone.search('mother', mode='egrep'), 1) assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1) child_res = clone.search('child', mode='egrep') assert_result_count(child_res, 2) for r in child_res: if r['type'] == 'dataset': assert_in(r['query_matched']['frictionless_datapackage.name'], r['metadata']['frictionless_datapackage']['name'])
def __call__(message=None, path=None, dataset=None, all_updated=True, version_tag=None, recursive=False, recursion_limit=None, super_datasets=False, message_file=None ): if not dataset and not path: # we got nothing at all -> save what is staged in the repo in "this" directory? # make sure we don't treat this as a user-provided '.' argument path = [{'path': abspath(curdir), 'raw_input': False}] refds_path = Interface.get_refds_path(dataset) if message and message_file: raise ValueError("Both a message and message file were specified") if message_file: with open(message_file, "rb") as mfh: message = assure_unicode(mfh.read()) to_process = [] got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='save', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', modified='HEAD' if not path and recursive else None, return_type='generator', on_failure='ignore'): if ap.get('state', None) == 'untracked' and not ap.get('raw_input', False): # this path was found untracked, but not explicitly given to save # we will silently ignore this continue got_nothing = False # next check should not be done during annotation, as it is possibly expensive # and not generally useful if ap.get('status', None) == 'impossible' and \ ap.get('state', None) == 'absent' and \ ap.get('parentds', None): # this is not here anymore, but it might actually have been a deleted # component if relpath(ap['path'], start=ap['parentds']) \ in Dataset(ap['parentds']).repo.get_deleted_files(): # ok, this is a staged deletion that we want to save ap['status'] = '' del ap['message'] if ap.get('status', None): # this is done yield ap continue # for things like: `ds.save()` # or recursively discovered datasets if ap['path'] == refds_path or \ (ap.get('type', None) == 'dataset' and not ap.get('raw_input', False) and not ap.get('state', None) == 'absent'): ap['process_content'] = True ap['process_updated_only'] = all_updated to_process.append(ap) lgr.log(2, "save, to_process=%r", to_process) if got_nothing and recursive and refds_path: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict( 'save', status='notneeded', path=refds_path, type='dataset', logger=lgr) return if not to_process: # nothing left to do, potentially all errored before return if super_datasets: # search for the topmost superdatasets of any path dss = [Dataset(ap.get('parentds', ap['path'])) for ap in to_process] superdss = [ds.get_superdataset(topmost=True) for ds in dss] superdss = get_tree_roots( unique(ds.path for ds in dss + superdss if ds)) if dataset: # need to adjust the reference to the new superds # if we had one ref before, we should still have exactly one assert len(superdss) <= 1 dataset = list(superdss.keys())[0] refds_path = dataset elif refds_path: # there is a single superdataset superdss = { refds_path: unique([ap['parentds'] for ap in to_process if 'parentds' in ap])} else: # sort all datasets under their potential superdatasets # start from the top to get all subdatasets down the line # and collate them into as few superdatasets as possible # this is quick, just string operations superdss = get_tree_roots( unique([ap['parentds'] for ap in to_process if 'parentds' in ap])) # for each "superdataset" check the tree of subdatasets and make sure # we gather all datasets between the super and 
any subdataset # so we can save them all bottom-up in order to be able to properly # save the superdataset # if this is called from e.g. `add` this is actually not necessary, # but in the general case we cannot avoid it # TODO maybe introduce a switch? discovered = {} for superds_path in superdss: target_subs = superdss[superds_path] discover_dataset_trace_to_targets( # from here superds_path, # to all target_subs, [], discovered) # create a new minimally annotated path for each discovered dataset discovered_added = set() for parentds in discovered: for subds in discovered[parentds]: to_process.append(dict( path=subds, parentds=parentds, type='dataset')) discovered_added.add(subds) # make sure we have an entry for each dataset, including those # tha are just parents for parentds in discovered: if parentds not in discovered_added: to_process.append(dict( path=parentds, type='dataset', # make sure we save content of superds later on process_content=True, # but not do nasty things, like adding untracked content # just because we discovered this dataset process_updated_only=True)) # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, deduplication happens here too annotated_paths = AnnotatePaths.__call__( path=to_process, dataset=dataset, # never recursion, done already recursive=False, action='save', unavailable_path_status='', nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') # now sort into datasets so we can process them one by one content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path) assert(not completed) # iterate over all datasets, starting at the bottom for dspath in sorted(content_by_ds.keys(), reverse=True): ds = Dataset(dspath) res = get_status_dict('save', ds=ds, logger=lgr) if not ds.is_installed(): # TODO This is likely impossible now res['status'] = 'impossible' res['message'] = ('dataset %s is not installed', ds) yield res continue saved_state = save_dataset( ds, content_by_ds[dspath], message=message) res['status'] = 'ok' if saved_state else 'notneeded' # MIH: let's tag even if there was nothing commit. I'd forget this # option too often... if version_tag: try: # TODO: check whether comment below is still true after # removing the log swallowing: # again cannot help but force-silence low-level code, because # it screams like a made man instead of allowing top-level # code an orderly error report ds.repo.tag(version_tag) # even if we haven't saved anything res['status'] = 'ok' yield res except CommandError as e: if saved_state: # first we yield the result for the actual save yield res # and now complain that tagging didn't work yield get_status_dict( 'save', ds=ds, logger=lgr, status='error', message=( 'cannot tag this version: %s', e.stderr.strip())) else: yield res
def test_aggregation(path): # a hierarchy of three (super/sub)datasets, each with some native metadata ds = Dataset(op.join(path, 'origin')).create(force=True) ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') subds = ds.create('sub', force=True) subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') subsubds = subds.create('subsub', force=True) subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') assert_status('ok', ds.save(recursive=True)) # while we are at it: dot it again, nothing should happen assert_status('notneeded', ds.save(recursive=True)) assert_repo_status(ds.path) # aggregate metadata from all subdatasets into any superdataset, including # intermediate ones res = ds.meta_aggregate(recursive=True, into='all') # we get success report for both subdatasets and the superdataset, # and they get saved assert_result_count(res, 3, status='ok', action='meta_aggregate') # the respective super datasets see two saves, one to record the change # in the subdataset after its own aggregation, and one after the super # updated with aggregated metadata assert_result_count(res, 5, status='ok', action='save', type='dataset') # nice and tidy assert_repo_status(ds.path) # quick test of aggregate report aggs = ds.meta_dump(reporton='aggregates', recursive=True) # one for each dataset assert_result_count(aggs, 3) # mother also report layout version assert_result_count(aggs, 1, path=ds.path, layout_version=1) # store clean direct result origres = ds.meta_dump(recursive=True) # basic sanity check assert_result_count(origres, 3, type='dataset') assert_result_count([r for r in origres if r['path'].endswith('.json')], 3, type='file') # Now that we have annex.key # three different IDs eq_( 3, len( set([ _get_dsid_from_core_metadata(s['metadata']['metalad_core']) for s in origres if s['type'] == 'dataset' ]))) # and we know about all three datasets for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'): assert_true( sum([s['metadata']['frictionless_datapackage']['name'] \ == assure_unicode(name) for s in origres if s['type'] == 'dataset'])) # now clone the beast to simulate a new user installing an empty dataset clone = install(op.join(path, 'clone'), source=ds.path, result_xfm='datasets', return_type='item-or-list') # ID mechanism works eq_(ds.id, clone.id) # get fresh metadata cloneres = clone.meta_dump() # basic sanity check assert_result_count(cloneres, 1, type='dataset') # payload file assert_result_count(cloneres, 1, type='file') # now loop over the previous results from the direct metadata query of # origin and make sure we get the extact same stuff from the clone _compare_metadata_helper(origres, clone) # now obtain a subdataset in the clone, should make no difference assert_status('ok', clone.install('sub', result_xfm=None, return_type='list')) _compare_metadata_helper(origres, clone) # test search in search tests, not all over the place ## query smoke test assert_result_count(clone.search('mother', mode='egrep'), 1) assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1) child_res = clone.search('child', mode='egrep') assert_result_count(child_res, 2) for r in child_res: if r['type'] == 'dataset': assert_in(r['query_matched']['frictionless_datapackage.name'], r['metadata']['frictionless_datapackage']['name'])
def test_aggregation(path): with chpwd(path): assert_raises(InsufficientArgumentsError, aggregate_metadata, None) # a hierarchy of three (super/sub)datasets, each with some native metadata ds = Dataset(opj(path, 'origin')).create(force=True) subds = ds.create('sub', force=True) subsubds = subds.create('subsub', force=True) # aggregate from bottom to top, guess native data, no compacting of graph # should yield 6 meta data sets, one implicit, and one native per dataset # and a second natiev set for the topmost dataset aggregate_metadata(ds, guess_native_type=True, recursive=True) # no only ask the top superdataset, no recursion, just reading from the cache meta = get_metadata( ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False) assert_equal(len(meta), 10) # same schema assert_equal( 10, sum([s.get('@context', {'@vocab': None})['@vocab'] == 'http://schema.org/' for s in meta])) # three different IDs assert_equal(3, len(set([s.get('@id') for s in meta]))) # and we know about all three datasets for name in ('mother_äöü東', 'child_äöü東', 'grandchild_äöü東'): assert_true(sum([s.get('name', None) == assure_unicode(name) for s in meta])) #print(meta) assert_equal( # first implicit, then two natives, then aggregate meta[3]['dcterms:hasPart']['@id'], subds.id) success = False for m in meta: p = m.get('dcterms:hasPart', {}) if p.get('@id', None) == subsubds.id: assert_equal(opj('sub', 'subsub'), p.get('location', None)) success = True assert_true(success) # save the toplevel dataset only (see below) ds.save('with aggregated meta data', all_changes=True) # now clone the beast to simulate a new user installing an empty dataset clone = install(opj(path, 'clone'), source=ds.path) # ID mechanism works assert_equal(ds.id, clone.id) # get fresh meta data, the implicit one for the top-most datasets should # differ, but the rest not clonemeta = get_metadata( clone, guess_type=False, ignore_subdatasets=False, ignore_cache=False) # make sure the implicit md for the topmost come first assert_equal(clonemeta[0]['@id'], clone.id) assert_equal(clonemeta[0]['@id'], ds.id) assert_equal(clone.repo.get_hexsha(), ds.repo.get_hexsha()) assert_equal(clonemeta[0]['version'], ds.repo.get_hexsha()) # all but the implicit is identical assert_equal(clonemeta[1:], meta[1:]) # the implicit md of the clone should list a dataset ID for its subds, # although it has not been obtained! 
assert_equal( clonemeta[3]['dcterms:hasPart']['@id'], subds.id) # now obtain a subdataset in the clone and the IDs should be updated clone.install('sub') partial = get_metadata(clone, guess_type=False, ignore_cache=True) # ids don't change assert_equal(partial[0]['@id'], clonemeta[0]['@id']) # datasets are properly connected assert_equal(partial[1]['dcterms:hasPart']['@id'], partial[2]['@id']) # query smoke test if os.environ.get('DATALAD_TESTS_NONETWORK'): raise SkipTest assert_equal(len(list(clone.search('mother'))), 1) assert_equal(len(list(clone.search('MoTHER'))), 1) # case insensitive child_res = list(clone.search('child')) assert_equal(len(child_res), 2) # little helper to match names def assert_names(res, names, path=clone.path): assert_equal(list(map(itemgetter(0), res)), [opj(path, n) for n in names]) # should yield (location, report) tuples assert_names(child_res, ['sub', 'sub/subsub']) # result should be identical to invoking search from api # and search_ should spit out locations out with swallow_outputs() as cmo: res = list(search_('child', dataset=clone)) assert_equal(res, child_res) assert_in(res[0][0], cmo.out) # and overarching search_ just for smoke testing of processing outputs # and not puking (e.g. under PY3) with swallow_outputs() as cmo: assert list(search_('.', regex=True, dataset=clone)) assert cmo.out # test searching among specified properties only assert_names(clone.search('i', search='name'), ['sub', 'sub/subsub']) assert_names(clone.search('i', search='keywords'), ['.']) # case shouldn't matter assert_names(clone.search('i', search='Keywords'), ['.']) assert_names(clone.search('i', search=['name', 'keywords']), ['.', 'sub', 'sub/subsub']) # without report_matched, we are getting none of the fields assert(all([not x for x in map(itemgetter(1), child_res)])) # but we would get all if asking for '*' assert(all([len(x) >= 9 for x in map(itemgetter(1), list(clone.search('child', report='*')))])) # but we would get only the matching name if we ask for report_matched assert_equal( set(map(lambda x: tuple(x[1].keys()), clone.search('child', report_matched=True))), set([('name',)]) ) # and the additional field we might have asked with report assert_equal( set(map(lambda x: tuple(sorted(x[1].keys())), clone.search('child', report_matched=True, report=['schema:type']))), set([('name', 'schema:type')]) ) # and if we ask report to be 'empty', we should get no fields child_res_empty = list(clone.search('child', report='')) assert_equal(len(child_res_empty), 2) assert_equal( set(map(lambda x: tuple(x[1].keys()), child_res_empty)), set([tuple()]) ) # more tests on returned paths: assert_names(clone.search('datalad'), ['.', 'sub', 'sub/subsub']) # if we clone subdataset and query for value present in it and its kid clone_sub = clone.install('sub') assert_names(clone_sub.search('datalad'), ['.', 'subsub'], clone_sub.path) # Test 'and' for multiple search entries assert_equal(len(list(clone.search(['child', 'bids']))), 2) assert_equal(len(list(clone.search(['child', 'subsub']))), 1) assert_equal(len(list(clone.search(['bids', 'sub']))), 2) res = list(clone.search('.*', regex=True)) # with regex assert_equal(len(res), 3) # one per dataset # we do search, not match assert_equal(len(list(clone.search('randchild', regex=True))), 1) assert_equal(len(list(clone.search(['gr.nd', 'ch.ld'], regex=True))), 1) assert_equal(len(list(clone.search('randchil.', regex=True))), 1) assert_equal(len(list(clone.search('^randchild.*', regex=True))), 0) 
assert_equal(len(list(clone.search('^grandchild.*', regex=True))), 1) assert_equal(len(list(clone.search('grandchild'))), 1)
def __call__( path=None, dataset=None, # support passing this through in a path by path basis to_git=None, save=True, message=None, message_file=None, recursive=False, recursion_limit=None, ds2super=False, git_opts=None, annex_opts=None, annex_add_opts=None, jobs=None): # parameter constraints: if not path: raise InsufficientArgumentsError( "insufficient information for adding: requires at least a path" ) refds_path = Interface.get_refds_path(dataset) common_report = dict(action='add', logger=lgr, refds=refds_path) if message and message_file: raise ValueError("Both a message and message file were specified") if message_file: with open(message_file, "rb") as mfh: message = assure_unicode(mfh.read()) to_add = [] subds_to_add = {} ds_to_annotate_from_recursion = {} got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=dataset, # never recursion, need to handle manually below to be able to # discover untracked content recursive=False, action='add', # speed things up by using Git's modification detection, if there # is a repo with at least one commit modified='HEAD' \ if dataset and \ GitRepo.is_valid_repo(refds_path) and \ GitRepo(refds_path).get_hexsha() \ else None, unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): got_nothing = False if ap.get('status', None): # this is done yield ap continue if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset': yield get_status_dict( status='impossible', message='"there is no dataset to add this path to', **dict(common_report, **ap)) continue if ap.get('type', None) == 'directory' and \ ap.get('state', None) == 'untracked' and \ GitRepo.is_valid_repo(ap['path']): # this is an untracked wannabe subdataset in disguise ap['type'] = 'dataset' if recursive and \ (ap.get('raw_input', False) or ap.get('state', None) in ('added', 'modified', 'untracked')) and \ (ap.get('parentds', None) or ap.get('type', None) == 'dataset'): # this was an actually requested input path, or a path that was found # modified by path annotation, based on an input argument # we need to recurse into all subdirs to find potentially # unregistered subdatasets # but only if this path has a parent, or is itself a dataset # otherwise there is nothing to add to _discover_subdatasets_recursively( ds_to_annotate_from_recursion, ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']], recursion_limit) # get the file content of the root dataset of this search added too # but be careful with extreme recursion_limit settings if recursion_limit is None or recursion_limit > 0: ap['process_content'] = True # record for further processing if not ap['path'] in ds_to_annotate_from_recursion: # if it was somehow already discovered to_add.append(ap) if got_nothing: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict('add', status='notneeded', path=refds_path, type='dataset', logger=lgr) return for subds in ds_to_annotate_from_recursion: if subds not in subds_to_add: # always prefer the already annotated path subds_to_add[subds] = ds_to_annotate_from_recursion[subds] if dataset: # we have a base dataset, discover any intermediate datasets between # the base and any already discovered dataset discovered = {} discover_dataset_trace_to_targets( # from here dataset.path, # to any dataset we are aware of subds_to_add.keys(), [], 
def get_metadata(self, dataset, content): if not content: return {}, [] context = {} contentmeta = [] log_progress( lgr.info, 'extractorxmp', 'Start XMP metadata extraction from %s', self.ds, total=len(self.paths), label='XMP metadata extraction', unit=' Files', ) for f in self.paths: absfp = opj(self.ds.path, f) log_progress( lgr.info, 'extractorxmp', 'Extract XMP metadata from %s', absfp, update=1, increment=True) info = file_to_dict(absfp) if not info: # got nothing, likely nothing there # TODO check if this is an XMP sidecar file, parse that, and assign metadata # to the base file continue # update vocabulary vocab = {info[ns][0][0].split(':')[0]: {'@id': ns, 'type': vocabulary_id} for ns in info} # TODO this is dirty and assumed that XMP is internally consistent with the # definitions across all files -- which it likely isn't context.update(vocab) # now pull out actual metadata # cannot do simple dict comprehension, because we need to beautify things a little meta = {} for ns in info: for key, val, props in info[ns]: if not val: # skip everything empty continue if key.count('[') > 1: # this is a nested array # MIH: I do not think it is worth going here continue if props['VALUE_IS_ARRAY']: # we'll catch the actuall array values later continue # normalize value val = assure_unicode(val) # non-breaking space val = val.replace(u"\xa0", ' ') field, idx, qual = xmp_field_re.match(key).groups() normkey = u'{}{}'.format(field, qual) if '/' in key: normkey = u'{0}<{1}>'.format(*normkey.split('/')) if idx: # array arr = meta.get(normkey, []) arr.append(val) meta[normkey] = arr else: meta[normkey] = val # compact meta = {k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in meta.items()} contentmeta.append((f, meta)) log_progress( lgr.info, 'extractorxmp', 'Finished XMP metadata extraction from %s', self.ds ) return { '@context': context, }, \ contentmeta
def test_assure_unicode():
    ok_(isinstance(assure_unicode("m"), str))
    ok_(isinstance(assure_unicode('grandchild_äöü東'), str))
    ok_(isinstance(assure_unicode(u'grandchild_äöü東'), str))
    eq_(assure_unicode('grandchild_äöü東'), u'grandchild_äöü東')
    # now, non-utf8
    # Decoding could be deduced with high confidence when the string is
    # really encoded in that codepage
    mom_koi8r = u"мама".encode('koi8-r')
    eq_(assure_unicode(mom_koi8r), u"мама")
    eq_(assure_unicode(mom_koi8r, confidence=0.9), u"мама")
    mom_iso8859 = u'mamá'.encode('iso-8859-1')
    eq_(assure_unicode(mom_iso8859), u'mamá')
    eq_(assure_unicode(mom_iso8859, confidence=0.5), u'mamá')
    # but when we mix, it does still guess something allowing to decode:
    mixedin = mom_koi8r + u'東'.encode('iso2022_jp') + u'東'.encode('utf-8')
    ok_(isinstance(assure_unicode(mixedin), str))
    # but it should fail if we request a high-confidence result:
    with assert_raises(ValueError):
        assure_unicode(mixedin, confidence=0.9)
    # For other, non-string values, it just returns the original value
    # TODO: RF to actually "assure" or fail?? For now hardcoding that assumption
    assert assure_unicode(1) == 1
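# The test above exercises decoding a bytestring under a `confidence`
# threshold. A minimal sketch of such a helper, assuming chardet-style
# detection -- an illustration only, not DataLad's actual assure_unicode
# implementation:
import chardet


def guess_decode(s, confidence=None):
    # non-bytes values (including already-decoded text) pass through unchanged
    if not isinstance(s, bytes):
        return s
    try:
        return s.decode('utf-8')
    except UnicodeDecodeError:
        guess = chardet.detect(s)  # e.g. {'encoding': 'KOI8-R', 'confidence': 0.98}
        if guess.get('encoding') is None or \
                (confidence is not None and guess['confidence'] < confidence):
            raise ValueError('cannot confidently decode %r' % (s,))
        return s.decode(guess['encoding'])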
def _any2unicode(val):
    return str_contructor(val) if isinstance(val, (int, float)) else assure_unicode(val)
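# str_contructor above is presumably an alias for the text type (str on
# Python 3); a minimal sketch of the same idea, for illustration only:
def any2unicode_sketch(val):
    # numbers are stringified, bytes are decoded, text is passed through
    if isinstance(val, (int, float)):
        return str(val)
    return val.decode('utf-8') if isinstance(val, bytes) else val

assert any2unicode_sketch(1) == u'1'
assert any2unicode_sketch(b'abc') == u'abc'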
def __call__(query=None, dataset=None, force_reindex=False, max_nresults=20, show_keys=False, show_query=False): from whoosh import qparser as qparse try: ds = require_dataset(dataset, check_installed=True, purpose='dataset search') if ds.id is None: raise NoDatasetArgumentFound( "This does not seem to be a dataset (no DataLad dataset ID " "found). 'datalad create --force %s' can initialize " "this repository as a DataLad dataset" % ds.path) except NoDatasetArgumentFound: for r in _search_from_virgin_install(dataset, query): yield r return # where does the bunny have the eggs? index_dir = opj(ds.path, get_git_dir(ds.path), 'datalad', 'search_index') idx_obj = _get_search_index(index_dir, ds, force_reindex) if show_keys: definitions_fname = opj(index_dir, 'datalad_term_definitions.json.gz') try: defs = jsonload(gzopen(definitions_fname)) except Exception as e: lgr.warning( 'No term definitions found alongside search index: %s', exc_str(e)) defs = {} for k in idx_obj.schema.names(): print('{}{}'.format( k, ' {}'.format(defs[k] if isinstance(defs[k], dict) else '({})'.format(defs[k])) if k in defs else '')) return if not query: return with idx_obj.searcher() as searcher: # parse the query string, default whoosh parser ATM, could be # tailored with plugins parser = qparse.MultifieldParser(idx_obj.schema.names(), idx_obj.schema) # XXX: plugin is broken in Debian's whoosh 2.7.0-2, but already fixed # upstream parser.add_plugin(qparse.FuzzyTermPlugin()) parser.add_plugin(qparse.GtLtPlugin()) # replace field defintion to allow for colons to be part of a field's name: parser.replace_plugin( qparse.FieldsPlugin(expr=r"(?P<text>[()<>:\w]+|[*]):")) # for convenience we accept any number of args-words from the # shell and put them together to a single string here querystr = ' '.join(assure_list(query)) # this gives a formal whoosh query wquery = parser.parse(querystr) if show_query: print(wquery) return # perform the actual search hits = searcher.search( wquery, terms=True, limit=max_nresults if max_nresults > 0 else None) # cheap way to get an approximate number of hits, without an expensive # scoring of all items # disabled: unreliable estimate, often confusing #nhits = hits.estimated_min_length() # report query stats topstr = '{} top {}'.format( max_nresults, single_or_plural('match', 'matches', max_nresults)) lgr.info('Query completed in {} sec.{}'.format( hits.runtime, ' Reporting {}.'.format(( 'up to ' + topstr) if max_nresults > 0 else 'all matches') if not hits.is_empty() else ' No matches.')) if not hits: return nhits = 0 for hit in hits: res = dict( action='search', status='ok', logger=lgr, refds=ds.path, # normpath to avoid trailing dot path=normpath(opj(ds.path, hit['path'])), query_matched={ assure_unicode(k): assure_unicode(v) if isinstance( v, unicode_srctypes) else v for k, v in hit.matched_terms() }, metadata={ k: v for k, v in hit.fields().items() if k not in ('path', 'parentds') }) if 'parentds' in hit: res['parentds'] = normpath(opj(ds.path, hit['parentds'])) yield res nhits += 1 if max_nresults and nhits == max_nresults: lgr.info("Reached the limit of {}, there could be more which " "were not reported.".format(topstr))
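# A self-contained sketch of the query parsing done above: MultifieldParser
# over the index schema, with the fuzzy-term and greater/less-than plugins
# enabled. The field names and the query string here are made up for
# illustration; the real schema is derived from the aggregated metadata.
from whoosh.fields import Schema, TEXT, ID
from whoosh import qparser as qparse

schema = Schema(path=ID(stored=True), name=TEXT, comment=TEXT)
parser = qparse.MultifieldParser(['name', 'comment'], schema)
parser.add_plugin(qparse.FuzzyTermPlugin())  # allow terms like mothre~2
parser.add_plugin(qparse.GtLtPlugin())       # allow field:>value comparisons
print(parser.parse(u'mother~2 comment:datalad'))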
def test_wtf(topdir): path = opj(topdir, OBSCURE_FILENAME) # smoke test for now with swallow_outputs() as cmo: wtf(dataset=path, on_failure="ignore") assert_not_in('## dataset', cmo.out) assert_in('## configuration', cmo.out) # Those sections get sensored out by default now assert_not_in('user.name: ', cmo.out) with chpwd(path): with swallow_outputs() as cmo: wtf() assert_not_in('## dataset', cmo.out) assert_in('## configuration', cmo.out) # now with a dataset ds = create(path) with swallow_outputs() as cmo: wtf(dataset=ds.path) assert_in('## configuration', cmo.out) assert_in('## dataset', cmo.out) assert_in(u'path: {}'.format(ds.path), assure_unicode(cmo.out)) # and if we run with all sensitive for sensitive in ('some', True): with swallow_outputs() as cmo: wtf(dataset=ds.path, sensitive=sensitive) # we fake those for tests anyways, but we do show cfg in this mode # and explicitly not showing them assert_in('user.name: %s' % _HIDDEN, cmo.out) with swallow_outputs() as cmo: wtf(dataset=ds.path, sensitive='all') assert_not_in(_HIDDEN, cmo.out) # all is shown assert_in('user.name: ', cmo.out) # Sections selection # # If we ask for no sections and there is no dataset with chpwd(path): with swallow_outputs() as cmo: wtf(sections=[]) assert_not_in('## dataset', cmo.out) for s in SECTION_CALLABLES: assert_not_in('## %s' % s.lower(), cmo.out.lower()) # ask for a selected set secs = ['git-annex', 'configuration'] with chpwd(path): with swallow_outputs() as cmo: wtf(sections=secs) for s in SECTION_CALLABLES: (assert_in if s in secs else assert_not_in)('## %s' % s.lower(), cmo.out.lower()) # order should match our desired one, not alphabetical # but because of https://github.com/datalad/datalad/issues/3915 # alphanum is now desired assert cmo.out.index('## git-annex') > cmo.out.index( '## configuration') # not achievable from cmdline is to pass an empty list of sections. with chpwd(path): with swallow_outputs() as cmo: wtf(sections=[]) eq_(cmo.out.rstrip(), '# WTF') # and we could decorate it nicely for embedding e.g. into github issues with swallow_outputs() as cmo: wtf(sections=['dependencies'], decor='html_details') ok_startswith(cmo.out, '<details><summary>DataLad %s WTF' % __version__) assert_in('## dependencies', cmo.out) # should result only in '# WTF' skip_if_no_module('pyperclip') # verify that it works correctly in the env/platform import pyperclip with swallow_outputs() as cmo: try: pyperclip.copy("xxx") pyperclip_works = pyperclip.paste().strip() == "xxx" wtf(dataset=ds.path, clipboard=True) except (AttributeError, pyperclip.PyperclipException) as exc: # AttributeError could come from pyperclip if no DISPLAY raise SkipTest(exc_str(exc)) assert_in("WTF information of length", cmo.out) assert_not_in('user.name', cmo.out) if not pyperclip_works: # Some times does not throw but just fails to work raise SkipTest( "Pyperclip seems to be not functioning here correctly") assert_not_in('user.name', pyperclip.paste()) assert_in(_HIDDEN, pyperclip.paste()) # by default no sensitive info assert_in("cmd:annex:", pyperclip.paste()) # but the content is there
def _describe_datalad():
    from datalad.version import __version__, __full_version__

    return {
        'version': assure_unicode(__version__),
        'full_version': assure_unicode(__full_version__),
    }
def get_metadata(self, dataset, content): if not content: return {}, [] context = {} contentmeta = [] # which files to look for fname_match_regex = self.ds.config.get( 'datalad.metadata.xmp.fname-match', '.*(jpg|jpeg|pdf|gif|tiff|tif|ps|eps|png|mp3|mp4|avi|wav)$') fname_match_regex = re.compile(fname_match_regex) log_progress( lgr.info, 'extractorxmp', 'Start XMP metadata extraction from %s', self.ds, total=len(self.paths), label='XMP metadata extraction', unit=' Files', ) for f in self.paths: log_progress(lgr.info, 'extractorxmp', 'Extract XMP metadata from %s', f, update=1, increment=True) # run basic file name filter for performance reasons # it is OK to let false-positives through if fname_match_regex.match(f, re.IGNORECASE) is None: continue absfp = opj(self.ds.path, f) info = file_to_dict(absfp) if not info: # got nothing, likely nothing there # TODO check if this is an XMP sidecar file, parse that, and assign metadata # to the base file continue # update vocabulary vocab = { info[ns][0][0].split(':')[0]: { '@id': ns, 'type': vocabulary_id } for ns in info } # TODO this is dirty and assumed that XMP is internally consistent with the # definitions across all files -- which it likely isn't context.update(vocab) # now pull out actual metadata # cannot do simple dict comprehension, because we need to beautify things a little meta = {} for ns in info: for key, val, props in info[ns]: if not val: # skip everything empty continue if key.count('[') > 1: # this is a nested array # MIH: I do not think it is worth going here continue if props['VALUE_IS_ARRAY']: # we'll catch the actuall array values later continue # normalize value val = assure_unicode(val) # non-breaking space val = val.replace(u"\xa0", ' ') field, idx, qual = xmp_field_re.match(key).groups() normkey = u'{}{}'.format(field, qual) if '/' in key: normkey = u'{0}<{1}>'.format(*normkey.split('/')) if idx: # array arr = meta.get(normkey, []) arr.append(val) meta[normkey] = arr else: meta[normkey] = val # compact meta = { k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in meta.items() } contentmeta.append((f, meta)) log_progress(lgr.info, 'extractorxmp', 'Finished XMP metadata extraction from %s', self.ds) return { '@context': context, }, \ contentmeta
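# The key normalization above relies on a module-level `xmp_field_re` with
# three groups (field, array index, qualifier) that is not shown in this
# excerpt. A hypothetical pattern with the same shape -- an assumption for
# illustration only, not the actual DataLad definition:
import re

xmp_field_re_sketch = re.compile(r'^([^\[]+)(?:\[(\d+)\])?(.*)$')

for key in ('dc:creator[1]', 'dc:title', 'xmpMM:History[2]/stEvt:action'):
    field, idx, qual = xmp_field_re_sketch.match(key).groups()
    print(key, '->', u'{}{}'.format(field, qual), 'array' if idx else 'scalar')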
def __call__(message=None, path=None, dataset=None, all_updated=True, version_tag=None, recursive=False, recursion_limit=None, super_datasets=False, message_file=None ): if not dataset and not path: # we got nothing at all -> save what is staged in the repo in "this" directory? # make sure we don't treat this as a user-provided '.' argument path = [{'path': abspath(curdir), 'raw_input': False}] refds_path = Interface.get_refds_path(dataset) if message and message_file: raise ValueError("Both a message and message file were specified") if message_file: with open(message_file, "rb") as mfh: message = assure_unicode(mfh.read()) to_process = [] got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=refds_path, recursive=recursive, recursion_limit=recursion_limit, action='save', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', modified='HEAD' if not path and recursive else None, return_type='generator', on_failure='ignore'): if ap.get('state', None) == 'untracked' and not ap.get('raw_input', False): # this path was found untracked, but not explicitly given to save # we will silently ignore this continue got_nothing = False # next check should not be done during annotation, as it is possibly expensive # and not generally useful if ap.get('status', None) == 'impossible' and \ ap.get('state', None) == 'absent' and \ ap.get('parentds', None): # this is not here anymore, but it might actually have been a deleted # component if relpath(ap['path'], start=ap['parentds']) \ in Dataset(ap['parentds']).repo.get_deleted_files(): # ok, this is a staged deletion that we want to save ap['status'] = '' del ap['message'] if ap.get('status', None): # this is done yield ap continue # for things like: `ds.save()` # or recursively discovered datasets if ap['path'] == refds_path or \ (ap.get('type', None) == 'dataset' and not ap.get('raw_input', False) and not ap.get('state', None) == 'absent'): ap['process_content'] = True ap['process_updated_only'] = all_updated to_process.append(ap) lgr.log(2, "save, to_process=%r", to_process) if got_nothing and recursive and refds_path: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict( 'save', status='notneeded', path=refds_path, type='dataset', logger=lgr) return if not to_process: # nothing left to do, potentially all errored before return if super_datasets: # search for the topmost superdatasets of any path dss = [Dataset(ap.get('parentds', ap['path'])) for ap in to_process] superdss = [ds.get_superdataset(topmost=True) for ds in dss] superdss = get_tree_roots( unique(ds.path for ds in dss + superdss if ds)) if dataset: # need to adjust the reference to the new superds # if we had one ref before, we should still have exactly one assert len(superdss) <= 1 dataset = list(superdss.keys())[0] refds_path = dataset elif refds_path: # there is a single superdataset superdss = { refds_path: unique([ap['parentds'] for ap in to_process if 'parentds' in ap])} else: # sort all datasets under their potential superdatasets # start from the top to get all subdatasets down the line # and collate them into as few superdatasets as possible # this is quick, just string operations superdss = get_tree_roots( unique([ap['parentds'] for ap in to_process if 'parentds' in ap])) # for each "superdataset" check the tree of subdatasets and make sure # we gather all datasets between the super and 
any subdataset # so we can save them all bottom-up in order to be able to properly # save the superdataset # if this is called from e.g. `add` this is actually not necessary, # but in the general case we cannot avoid it # TODO maybe introduce a switch? discovered = {} for superds_path in superdss: target_subs = superdss[superds_path] discover_dataset_trace_to_targets( # from here superds_path, # to all target_subs, [], discovered) # create a new minimally annotated path for each discovered dataset discovered_added = set() for parentds in discovered: for subds in discovered[parentds]: to_process.append(dict( path=subds, parentds=parentds, type='dataset')) discovered_added.add(subds) # make sure we have an entry for each dataset, including those # tha are just parents for parentds in discovered: if parentds not in discovered_added: to_process.append(dict( path=parentds, type='dataset', # make sure we save content of superds later on process_content=True, # but not do nasty things, like adding untracked content # just because we discovered this dataset process_updated_only=True)) # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, deduplication happens here too annotated_paths = AnnotatePaths.__call__( path=to_process, dataset=dataset, # never recursion, done already recursive=False, action='save', unavailable_path_status='', nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') # now sort into datasets so we can process them one by one content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path) assert(not completed) # iterate over all datasets, starting at the bottom for dspath in sorted(content_by_ds.keys(), reverse=True): ds = Dataset(dspath) res = get_status_dict('save', ds=ds, logger=lgr) if not ds.is_installed(): # TODO This is likely impossible now res['status'] = 'impossible' res['message'] = ('dataset %s is not installed', ds) yield res continue saved_state = save_dataset( ds, content_by_ds[dspath], message=message) res['status'] = 'ok' if saved_state else 'notneeded' # MIH: let's tag even if there was nothing commit. I'd forget this # option too often... if version_tag: try: # TODO: check whether comment below is still true after # removing the log swallowing: # again cannot help but force-silence low-level code, because # it screams like a made man instead of allowing top-level # code an orderly error report ds.repo.tag(version_tag) # even if we haven't saved anything res['status'] = 'ok' yield res except CommandError as e: if saved_state: # first we yield the result for the actual save yield res # and now complain that tagging didn't work yield get_status_dict( 'save', ds=ds, logger=lgr, status='error', message=( 'cannot tag this version: %s', e.stderr.strip())) else: yield res
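# `get_tree_roots()` above collates datasets under their topmost superdataset.
# A rough sketch of the idea, assuming POSIX-style absolute paths; this is a
# hypothetical helper for illustration, not the actual DataLad implementation:
def tree_roots_sketch(paths):
    roots = {}
    for p in sorted(paths):  # parents sort before their children
        root = next((r for r in roots if p == r or p.startswith(r + '/')), None)
        if root is None:
            roots[p] = []
        elif p != root:
            roots[root].append(p)
    return roots

# tree_roots_sketch(['/ds', '/ds/sub', '/other']) -> {'/ds': ['/ds/sub'], '/other': []}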
def __call__(dataset=None, sensitive=None, sections=None, decor=None, clipboard=None): from datalad.distribution.dataset import require_dataset from datalad.support.exceptions import NoDatasetFound from datalad.interface.results import get_status_dict ds = None try: ds = require_dataset(dataset, check_installed=False, purpose='reporting') except NoDatasetFound: # failure is already logged pass if ds and not ds.is_installed(): # warn that the dataset is bogus yield dict( action='wtf', path=ds.path, status='impossible', message=('No dataset found at %s. Reporting on the dataset is ' 'not attempted.', ds.path), logger=lgr) # we don't deal with absent datasets ds = None if sensitive: if ds is None: from datalad import cfg else: cfg = ds.config else: cfg = None from datalad.ui import ui from datalad.support.external_versions import external_versions infos = OrderedDict() res = get_status_dict( action='wtf', path=ds.path if ds else assure_unicode(op.abspath(op.curdir)), type='dataset' if ds else 'directory', status='ok', logger=lgr, decor=decor, infos=infos, ) # Define section callables which require variables. # so there is no side-effect on module level original section_callables = SECTION_CALLABLES.copy() section_callables['location'] = partial(_describe_location, res) section_callables['configuration'] = \ partial(_describe_configuration, cfg, sensitive) if ds: section_callables['dataset'] = \ partial(_describe_dataset, ds, sensitive) else: section_callables.pop('dataset') assert all(section_callables.values()) # check if none was missed if sections is None: sections = sorted(list(section_callables)) for s in sections: infos[s] = section_callables[s]() if clipboard: external_versions.check( 'pyperclip', msg="It is needed to be able to use clipboard") import pyperclip report = _render_report(res) pyperclip.copy(report) ui.message("WTF information of length %s copied to clipboard" % len(report)) yield res return
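# The report sections above are argument-less callables; report-specific state
# (cfg, sensitive, the result record) is bound in via functools.partial and the
# callables are invoked in a simple loop. The idea in isolation, with
# illustrative names only:
from functools import partial


def describe_configuration_sketch(cfg, sensitive):
    return {'sensitive': sensitive, 'have_cfg': cfg is not None}

section_callables = {
    'configuration': partial(describe_configuration_sketch, None, False),
}
infos = {s: section_callables[s]() for s in sorted(section_callables)}
print(infos)  # -> {'configuration': {'sensitive': False, 'have_cfg': False}}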
def show_keys(self, mode=None): maxl = 100 # maximal line length for unique values in mode=short # use a dict already, later we need to map to a definition # meanwhile map to the values class key_stat: def __init__(self): self.ndatasets = 0 # how many datasets have this field self.uvals = set() from collections import defaultdict keys = defaultdict(key_stat) for res in query_aggregated_metadata( # XXX TODO After #2156 datasets may not necessarily carry all # keys in the "unique" summary reporton='datasets', ds=self.ds, aps=[dict(path=self.ds.path, type='dataset')], recursive=True): meta = res.get('metadata', {}) # inject a few basic properties into the dict # analog to what the other modes do in their index meta.update({ k: res.get(k, None) for k in ('@id', 'type', 'path', 'parentds') # parentds is tricky all files will have it, but the dataset # queried above might not (single dataset), let's force it in if k == 'parentds' or k in res }) # no stringification of values for speed idxd = _meta2autofield_dict(meta, val2str=False) for k, kvals in iteritems(idxd): # TODO deal with conflicting definitions when available keys[k].ndatasets += 1 if mode == 'name': continue try: kvals_set = assure_iter(kvals, set) except TypeError: # TODO: may be do show hashable ones??? nunhashable = sum( isinstance(x, collections.Hashable) for x in kvals) kvals_set = { 'unhashable %d out of %d entries' % (nunhashable, len(kvals)) } keys[k].uvals |= kvals_set for k in sorted(keys): if mode == 'name': print(k) continue # do a bit more stat = keys[k] uvals = stat.uvals if mode == 'short': # show only up to X uvals if len(stat.uvals) > 10: uvals = {v for i, v in enumerate(uvals) if i < 10} # all unicode still scares yoh -- he will just use repr # def conv(s): # try: # return '{}'.format(s) # except UnicodeEncodeError: # return assure_unicode(s).encode('utf-8') stat.uvals_str = assure_unicode("{} unique values: {}".format( len(stat.uvals), ', '.join(map(repr, uvals)))) if mode == 'short': if len(stat.uvals) > 10: stat.uvals_str += ', ...' if len(stat.uvals_str) > maxl: stat.uvals_str = stat.uvals_str[:maxl - 4] + ' ....' elif mode == 'full': pass else: raise ValueError( "Unknown value for stats. Know full and short") print('{k}\n in {stat.ndatasets} datasets\n has {stat.uvals_str}'. format(k=k, stat=stat))
def _mk_search_index(self, force_reindex): """Generic entrypoint to index generation The actual work that determines the structure and content of the index is done by functions that are passed in as arguments `meta2doc` - must return dict for index document from result input """ from whoosh import index as widx from .metadata import agginfo_relpath # what is the lastest state of aggregated metadata metadata_state = self.ds.repo.get_last_commit_hash(agginfo_relpath) # use location common to all index types, they would all invalidate # simultaneously stamp_fname = opj(self.index_dir, 'datalad_metadata_state') index_dir = opj(self.index_dir, self._mode_label) if (not force_reindex) and \ exists(index_dir) and \ exists(stamp_fname) and \ open(stamp_fname).read() == metadata_state: try: # TODO check that the index schema is the same # as the one we would have used for reindexing # TODO support incremental re-indexing, whoosh can do it idx = widx.open_dir(index_dir) lgr.debug('Search index contains %i documents', idx.doc_count()) self.idx_obj = idx return except widx.LockError as e: raise e except widx.IndexError as e: # Generic index error. # we try to regenerate lgr.warning( "Cannot open existing index %s (%s), will regenerate", index_dir, exc_str(e)) except widx.IndexVersionError as e: # (msg, version, release=None) # Raised when you try to open an index using a format that the # current version of Whoosh cannot read. That is, when the index # you're trying to open is either not backward or forward # compatible with this version of Whoosh. # we try to regenerate lgr.warning(exc_str(e)) pass except widx.OutOfDateError as e: # Raised when you try to commit changes to an index which is not # the latest generation. # this should not happen here, but if it does ... KABOOM raise except widx.EmptyIndexError as e: # Raised when you try to work with an index that has no indexed # terms. 
# we can just continue with generating an index pass except ValueError as e: if 'unsupported pickle protocol' in str(e): lgr.warning( "Cannot open existing index %s (%s), will regenerate", index_dir, exc_str(e)) else: raise lgr.info('{} search index'.format( 'Rebuilding' if exists(index_dir) else 'Building')) if not exists(index_dir): os.makedirs(index_dir) # this is a pretty cheap call that just pull this info from a file dsinfo = self.ds.metadata(get_aggregates=True, return_type='list', result_renderer='disabled') self._mk_schema(dsinfo) idx_obj = widx.create_in(index_dir, self.schema) idx = idx_obj.writer( # cache size per process limitmb=cfg.obtain('datalad.search.indexercachesize'), # disable parallel indexing for now till #1927 is resolved ## number of processes for indexing #procs=multiprocessing.cpu_count(), ## write separate index segments in each process for speed ## asks for writer.commit(optimize=True) #multisegment=True, ) # load metadata of the base dataset and what it knows about all its subdatasets # (recursively) old_idx_size = 0 old_ds_rpath = '' idx_size = 0 log_progress( lgr.info, 'autofieldidxbuild', 'Start building search index', total=len(dsinfo), label='Building search index', unit=' Datasets', ) for res in query_aggregated_metadata( reporton=self.documenttype, ds=self.ds, aps=[dict(path=self.ds.path, type='dataset')], # MIH: I cannot see a case when we would not want recursion (within # the metadata) recursive=True): # this assumes that files are reported after each dataset report, # and after a subsequent dataset report no files for the previous # dataset will be reported again meta = res.get('metadata', {}) doc = self._meta2doc(meta) admin = { 'type': res['type'], 'path': relpath(res['path'], start=self.ds.path), } if 'parentds' in res: admin['parentds'] = relpath(res['parentds'], start=self.ds.path) if admin['type'] == 'dataset': if old_ds_rpath: lgr.debug( 'Added %s on dataset %s', single_or_plural('document', 'documents', idx_size - old_idx_size, include_count=True), old_ds_rpath) log_progress(lgr.info, 'autofieldidxbuild', 'Indexed dataset at %s', old_ds_rpath, update=1, increment=True) old_idx_size = idx_size old_ds_rpath = admin['path'] admin['id'] = res.get('dsid', None) doc.update({k: assure_unicode(v) for k, v in admin.items()}) lgr.debug("Adding document to search index: {}".format(doc)) # inject into index idx.add_document(**doc) idx_size += 1 if old_ds_rpath: lgr.debug( 'Added %s on dataset %s', single_or_plural('document', 'documents', idx_size - old_idx_size, include_count=True), old_ds_rpath) lgr.debug("Committing index") idx.commit(optimize=True) log_progress(lgr.info, 'autofieldidxbuild', 'Done building search index') # "timestamp" the search index to allow for automatic invalidation with open(stamp_fname, 'w') as f: f.write(metadata_state) lgr.info('Search index contains %i documents', idx_size) self.idx_obj = idx_obj
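# The index reuse decision above hinges on a stamp file recording the state of
# the aggregated metadata the index was built from. The invalidation check in
# isolation -- a sketch assuming the plain-text stamp files written above:
import os.path


def index_is_current(stamp_fname, metadata_state):
    # reuse an existing index only if it was built from the same metadata state
    if not os.path.exists(stamp_fname):
        return False
    with open(stamp_fname) as f:
        return f.read() == metadata_state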
def __call__( path=None, dataset=None, # support passing this through in a path by path basis to_git=None, save=True, message=None, message_file=None, recursive=False, recursion_limit=None, ds2super=False, git_opts=None, annex_opts=None, annex_add_opts=None, jobs=None): # parameter constraints: if not path: raise InsufficientArgumentsError( "insufficient information for adding: requires at least a path") refds_path = Interface.get_refds_path(dataset) common_report = dict(action='add', logger=lgr, refds=refds_path) if message and message_file: raise ValueError("Both a message and message file were specified") if message_file: with open(message_file, "rb") as mfh: message = assure_unicode(mfh.read()) to_add = [] subds_to_add = {} ds_to_annotate_from_recursion = {} got_nothing = True for ap in AnnotatePaths.__call__( path=path, dataset=dataset, # never recursion, need to handle manually below to be able to # discover untracked content recursive=False, action='add', # speed things up by using Git's modification detection, if there # is a repo with at least one commit modified='HEAD' \ if dataset and \ GitRepo.is_valid_repo(refds_path) and \ GitRepo(refds_path).get_hexsha() \ else None, unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', on_failure='ignore'): got_nothing = False if ap.get('status', None): # this is done yield ap continue if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset': yield get_status_dict( status='impossible', message='"there is no dataset to add this path to', **dict(common_report, **ap)) continue if ap.get('type', None) == 'directory' and \ ap.get('state', None) == 'untracked' and \ GitRepo.is_valid_repo(ap['path']): # this is an untracked wannabe subdataset in disguise ap['type'] = 'dataset' if recursive and \ (ap.get('raw_input', False) or ap.get('state', None) in ('added', 'modified', 'untracked')) and \ (ap.get('parentds', None) or ap.get('type', None) == 'dataset'): # this was an actually requested input path, or a path that was found # modified by path annotation, based on an input argument # we need to recurse into all subdirs to find potentially # unregistered subdatasets # but only if this path has a parent, or is itself a dataset # otherwise there is nothing to add to _discover_subdatasets_recursively( ds_to_annotate_from_recursion, ap['path'], [ap['parentds'] if 'parentds' in ap else ap['path']], recursion_limit) # get the file content of the root dataset of this search added too # but be careful with extreme recursion_limit settings if recursion_limit is None or recursion_limit > 0: ap['process_content'] = True # record for further processing if not ap['path'] in ds_to_annotate_from_recursion: # if it was somehow already discovered to_add.append(ap) if got_nothing: # path annotation yielded nothing, most likely cause is that nothing # was found modified, we need to say something about the reference # dataset yield get_status_dict( 'add', status='notneeded', path=refds_path, type='dataset', logger=lgr) return for subds in ds_to_annotate_from_recursion: if subds not in subds_to_add: # always prefer the already annotated path subds_to_add[subds] = ds_to_annotate_from_recursion[subds] if dataset: # we have a base dataset, discover any intermediate datasets between # the base and any already discovered dataset discovered = {} discover_dataset_trace_to_targets( # from here dataset.path, # to any dataset we are aware of subds_to_add.keys(), [], 
discovered) for parentds in discovered: for subds in discovered[parentds]: subds_to_add[subds] = subds_to_add.get( subds, dict(path=subds, parentds=parentds, type='dataset')) # merge custom paths and discovered dataset records, paths needs to go first, # because we know most about then, and subsequent annotation call we skip the # later duplicate ones to_add.extend(subds_to_add.values()) # and compact, this should be OK as all the info is in each ap dict to_add = unique(to_add, lambda x: x['path']) if not to_add: # nothing left to do, potentially all errored before return # now re-annotate all paths, this will be fast for already annotated ones # and will amend the annotation for others, it will also deduplicate annotated_paths = AnnotatePaths.__call__( path=to_add, dataset=dataset, # never recursion, done already recursive=False, action='add', unavailable_path_status='impossible', unavailable_path_msg="path does not exist: %s", nondataset_path_status='impossible', return_type='generator', # if there is an error now, we made this mistake in here on_failure='stop') content_by_ds, ds_props, completed, nondataset_paths = \ annotated2content_by_ds( annotated_paths, refds_path=refds_path) assert(not completed) if not content_by_ds: # we should have complained about any inappropriate path argument # above, so if nothing is left, we can simply exit return # simple loop over datasets -- save happens later # start deep down to_save = [] for ds_path in sorted(content_by_ds, reverse=True): ds = Dataset(ds_path) torepoadd = {} respath_by_status = {} for ap in content_by_ds[ds_path]: # we have a new story ap.pop('status', None) torepoadd[ap['path']] = ap # skip anything that doesn't look like a wannabe subdataset if not ap.get('type', None) == 'dataset' or \ ap['path'] == ds_path: continue if ap.get('registered_subds', False): # subdataset that might be in this list because of the # need to save all the way up to a super dataset respath_by_status['success'] = \ respath_by_status.get('success', []) + [ap['path']] yield get_status_dict( status='notneeded', message="already known subdataset", **dict(common_report, **ap)) continue subds = Dataset(ap['path']) subds_relpath = relpath(ap['path'], ds_path) # Register the repository in the repo tree as a submodule try: ds.repo.add_submodule(subds_relpath, url=None, name=None) except (CommandError, InvalidGitRepositoryError) as e: yield get_status_dict( ds=subds, status='error', message=e.stderr, **dict(common_report, **ap)) continue # queue for saving using the updated annotated path ap['registered_subds'] = True # I hope this is true in direct mode too # TODO this is disabled, because in some circumstances # staging just doesn't happen, and it is unclear when # exactly -- the case that prompted disabling was a submodule # that had no content except for other submodules was not staged, # whereas another submodule on the same level in the same # superdataset which also has one file in it was staged # disable to work correctly, while paying a little bit of # slow down #ap['staged'] = True to_save.append(ap) # report added subdatasets -- `annex add` below won't do it yield get_status_dict( ds=subds, status='ok', message='added new subdataset', **dict(common_report, **ap)) # make sure that .gitmodules is added to the list of files gitmodules_path = opj(ds.path, '.gitmodules') # for git torepoadd[gitmodules_path] = dict(path=gitmodules_path) # and for save to_save.append(dict( path=gitmodules_path, parentds=ds_path, type='file')) # make sure any last minute 
additions make it to the saving stage # XXX? should content_by_ds become OrderedDict so that possible # super here gets processed last? lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd) is_annex = isinstance(ds.repo, AnnexRepo) add_kw = {'jobs': jobs} if is_annex and jobs else {} added = ds.repo.add_( list(torepoadd.keys()), git=to_git if is_annex else True, **add_kw ) for a in added: res = annexjson2result(a, ds, type='file', **common_report) success = success_status_map[res['status']] respath_by_status[success] = \ respath_by_status.get(success, []) + [res['path']] # produce best possible path/result annotation if res['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report res = dict(torepoadd[res['path']], **res) # override this in all cases to be safe res['parentds'] = ds.path if success: # this was successfully added, queue for saving this very path # in the dataset ap = {k: v for k, v in res.items() if k != 'status'} ap['staged'] = True # strip any status and state info (e.g. save will refuse to save # stuff that is marked state='untracked' to_save.append({k: v for k, v in res.items() if k not in ('status', 'state')}) if a['file'] == '.gitmodules': # filter out .gitmodules, because this is only included for # technical reasons and has nothing to do with the actual content continue if GitRepo.is_valid_repo(res['path']): # more accurate report in case of an added submodule # mountpoint. # XXX Actually not sure if this can really happen # (depends on what our low-level code would do) # but worst case is that we loose a little bit of # coverage... res['type'] = 'dataset' res['message'] = 'added new state as submodule' yield res for r in results_from_annex_noinfo( ds, torepoadd, respath_by_status, dir_fail_msg='could not add some content in %s %s', noinfo_dir_msg='nothing to add from %s', noinfo_file_msg='already included in the dataset', action='add', logger=lgr, refds=refds_path): if r['path'] in torepoadd: # pull out correct ap for any path that comes out here # (that we know things about), and use the original annotation # instead of just the annex report r = dict(r, **torepoadd[r['path']]) if r['status'] == 'notneeded': # this could be a file that was staged already, it doesn't need # to be added, but it should be saved/commited if so desired to_save.append({k: v for k, v in r.items() if k not in ('status', 'state')}) # XXX something is fishy with the next one, rethink when sober.... if r['path'] == ds_path and r['status'] == 'ok': # this is for the entire dataset itself which was explicitly requested # make sure to save all r['type'] = 'dataset' r['process_content'] = True to_save.append({k: v for k, v in r.items() if k != 'status'}) yield r if refds_path and ds_path != refds_path and len(respath_by_status.get('success', [])): # TODO XXX we have an issue here when with `add('.')` and annex ignores any # dotfiles. In this case we end up not saving a dataset completely, because # we rely on accurate reporting. 
there is an issue about this already # TODO look up the issue ID # if there is a base dataset, but we are below it, and we have anything done to this # dataset -> queue dataset itself for saving its state in the parent ds_ap = dict( path=ds.path, # we have to look for the parent here, as we must save the # subdataset in the parent and not the whole subdataset itself type='dataset') parentds = get_dataset_root(normpath(opj(ds.path, pardir))) if parentds: ds_ap['parentds'] = parentds if dataset: ds_ap['refds'] = refds_path to_save.append(ds_ap) if not save: lgr.debug('Not calling `save` as instructed') return # TODO tell save what was staged already! Set 'staged=True' for # respective annotated paths that are fed into `save` # do not reuse any of the sorting done in here for saving, but instead # pass on all the annotated paths to have `save` figure out what to do with # them -- this is costs something, but should be safer, and frankly is # more comprehensible for res in Save.__call__( # hand-selected annotated paths path=to_save, dataset=refds_path, message=message if message else '[DATALAD] added content', return_type='generator', result_xfm=None, result_filter=None, on_failure='ignore'): yield res
def test_aggregation(path): with chpwd(path): assert_raises(InsufficientArgumentsError, aggregate_metadata, None) # a hierarchy of three (super/sub)datasets, each with some native metadata ds = Dataset(opj(path, 'origin')).create(force=True) # before anything aggregated we would get nothing and only a log warning with swallow_logs(new_level=logging.WARNING) as cml: assert_equal(list(query_aggregated_metadata('all', ds, [])), []) assert_re_in('.*Found no aggregated metadata.*update', cml.out) ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') subds = ds.create('sub', force=True) subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') subsubds = subds.create('subsub', force=True) subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage', where='dataset') ds.add('.', recursive=True) ok_clean_git(ds.path) # aggregate metadata from all subdatasets into any superdataset, including # intermediate ones res = ds.aggregate_metadata(recursive=True, update_mode='all') # we get success report for both subdatasets and the superdataset, # and they get saved assert_result_count(res, 6) assert_result_count(res, 3, status='ok', action='aggregate_metadata') assert_result_count(res, 3, status='ok', action='save') # nice and tidy ok_clean_git(ds.path) # quick test of aggregate report aggs = ds.metadata(get_aggregates=True) # one for each dataset assert_result_count(aggs, 3) # mother also report layout version assert_result_count(aggs, 1, path=ds.path, layout_version=1) # store clean direct result origres = ds.metadata(recursive=True) # basic sanity check assert_result_count(origres, 6) assert_result_count(origres, 3, type='dataset') assert_result_count(origres, 3, type='file') # Now that we have annex.key # three different IDs assert_equal(3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset']))) # and we know about all three datasets for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'): assert_true( sum([s['metadata']['frictionless_datapackage']['name'] \ == assure_unicode(name) for s in origres if s['type'] == 'dataset'])) # now clone the beast to simulate a new user installing an empty dataset clone = install( opj(path, 'clone'), source=ds.path, result_xfm='datasets', return_type='item-or-list') # ID mechanism works assert_equal(ds.id, clone.id) # get fresh metadata cloneres = clone.metadata() # basic sanity check assert_result_count(cloneres, 2) assert_result_count(cloneres, 1, type='dataset') assert_result_count(cloneres, 1, type='file') # now loop over the previous results from the direct metadata query of # origin and make sure we get the extact same stuff from the clone _compare_metadata_helper(origres, clone) # now obtain a subdataset in the clone, should make no difference assert_status('ok', clone.install('sub', result_xfm=None, return_type='list')) _compare_metadata_helper(origres, clone) # test search in search tests, not all over the place ## query smoke test assert_result_count(clone.search('mother', mode='egrep'), 1) assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1) child_res = clone.search('child', mode='egrep') assert_result_count(child_res, 2) for r in child_res: if r['type'] == 'dataset': assert_in( r['query_matched']['frictionless_datapackage.name'], r['metadata']['frictionless_datapackage']['name'])