def to_str(self, include_output=True):
    from datalad.utils import (
        ensure_unicode,
        ensure_list,
        quote_cmdlinearg,
    )
    to_str = "{}: ".format(self.__class__.__name__)
    if self.cmd:
        to_str += "'{}'".format(
            # go for a compact, normal looking, properly quoted
            # command rendering
            ' '.join(quote_cmdlinearg(c) for c in ensure_list(self.cmd)))
    if self.code:
        to_str += " failed with exitcode {}".format(self.code)
    if self.cwd:
        # only if not under standard PWD
        to_str += " under {}".format(self.cwd)
    if self.msg:
        # typically a command error has no specific idea
        to_str += " [{}]".format(ensure_unicode(self.msg))

    if not include_output:
        return to_str

    if self.stdout:
        to_str += " [out: '{}']".format(ensure_unicode(self.stdout).strip())
    if self.stderr:
        to_str += " [err: '{}']".format(ensure_unicode(self.stderr).strip())
    if self.kwargs:
        to_str += " [info keys: {}]".format(', '.join(self.kwargs.keys()))
    return to_str
def normalize_command(command):
    """Convert `command` to the string representation.
    """
    if isinstance(command, list):
        command = list(map(ensure_unicode, command))
        if len(command) == 1 and command[0] != "--":
            # This is either a quoted compound shell command or a simple
            # one-item command. Pass it as is.
            #
            # FIXME: This covers the predominant command-line case, but, for
            # Python API callers, it means values like ["./script with spaces"]
            # require additional string-like escaping, which is inconsistent
            # with the handling of multi-item lists (and subprocess's
            # handling). Once we have a way to detect "running from Python API"
            # (discussed in gh-2986), update this.
            command = command[0]
        else:
            if command and command[0] == "--":
                # Strip disambiguation marker. Note: "running from Python API"
                # FIXME from below applies to this too.
                command = command[1:]
            command = join_cmdline(command)
    else:
        command = ensure_unicode(command)
    return command
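# A minimal stdlib-only sketch (not the datalad API itself) of the same
# normalization logic, for illustration: shlex.join stands in for datalad's
# join_cmdline and plain str() for ensure_unicode.
import shlex


def _normalize_command_sketch(command):
    if isinstance(command, list):
        command = [str(c) for c in command]
        if len(command) == 1 and command[0] != "--":
            # single item: passed through as-is (may be a compound shell command)
            return command[0]
        if command and command[0] == "--":
            # strip the disambiguation marker
            command = command[1:]
        return shlex.join(command)
    return str(command)


# usage examples
assert _normalize_command_sketch("echo hi") == "echo hi"
assert _normalize_command_sketch(["./script with spaces"]) == "./script with spaces"
assert _normalize_command_sketch(["--", "ls", "my dir"]) == "ls 'my dir'"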
def get_extracted_files(self):
    """Generator to provide filenames which are available under extracted archive
    """
    path = self.assure_extracted()
    path_len = len(path) + (len(os.sep) if not path.endswith(os.sep) else 0)
    for root, dirs, files in os.walk(path):  # TEMP
        for name in files:
            yield ensure_unicode(opj(root, name)[path_len:])
def __call__(path=None,
             fr='HEAD',
             to=None,
             dataset=None,
             annex=None,
             untracked='normal',
             recursive=False,
             recursion_limit=None):
    yield from diff_dataset(
        dataset=dataset,
        fr=ensure_unicode(fr),
        to=ensure_unicode(to),
        constant_refs=False,
        path=path,
        annex=annex,
        untracked=untracked,
        recursive=recursive,
        recursion_limit=recursion_limit)
def _log_summary(self, fd, data):
    fd_name = self.fd_infos[fd][0]
    lgr.log(5, 'Read %i bytes from %i[%s]%s',
            len(data), self.process.pid, fd_name,
            ':' if self._log_outputs else '')
    if self._log_outputs:
        log_data = ensure_unicode(data)
        # The way we log is to stay consistent with Runner.
        # TODO: later we might just log in a single entry, without
        # fd_name prefix
        lgr.log(5, "%s| %s ", fd_name, log_data)
def to_str(self, include_output=True):
    from datalad.utils import (
        ensure_unicode,
        join_cmdline,
    )
    to_str = "{}: ".format(self.__class__.__name__)
    cmd = self.cmd
    if cmd:
        to_str += "'{}'".format(
            # go for a compact, normal looking, properly quoted
            # command rendering if the command is in list form
            join_cmdline(cmd) if isinstance(cmd, list) else cmd)
    if self.code:
        to_str += " failed with exitcode {}".format(self.code)
    if self.cwd:
        # only if not under standard PWD
        to_str += " under {}".format(self.cwd)
    if self.msg:
        # typically a command error has no specific idea
        to_str += " [{}]".format(ensure_unicode(self.msg))

    if not include_output:
        return to_str

    if self.stdout:
        to_str += " [out: '{}']".format(ensure_unicode(self.stdout).strip())
    if self.stderr:
        to_str += " [err: '{}']".format(ensure_unicode(self.stderr).strip())
    if self.kwargs:
        if 'stdout_json' in self.kwargs:
            src_keys = ('note', 'error-messages')
            from datalad.utils import unique
            json_errors = unique(
                '; '.join(str(m[key]) for key in src_keys if m.get(key))
                for m in self.kwargs['stdout_json']
                if any(m.get(k) for k in src_keys))
            if json_errors:
                to_str += " [errors from JSON records: {}]".format(json_errors)
        to_str += " [info keys: {}]".format(', '.join(self.kwargs.keys()))
    return to_str
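# Standalone sketch (not datalad code) of how the stdout_json branch above
# condenses git-annex-style JSON records into a de-duplicated error summary;
# dict.fromkeys stands in for datalad's unique() helper, and the records are
# made up for illustration.
records = [
    {"note": "checksum mismatch", "error-messages": ["corrupt content"]},
    {"note": "checksum mismatch", "error-messages": ["corrupt content"]},
    {"success": True},
]
src_keys = ("note", "error-messages")
json_errors = list(dict.fromkeys(
    "; ".join(str(m[key]) for key in src_keys if m.get(key))
    for m in records
    if any(m.get(k) for k in src_keys)))
print(json_errors)  # ["checksum mismatch; ['corrupt content']"]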
def test__version__():
    # in released stage, version in the last CHANGELOG entry
    # should correspond to the one in datalad
    CHANGELOG_filename = op.join(
        op.dirname(__file__), op.pardir, op.pardir, 'CHANGELOG.md')
    if not op.exists(CHANGELOG_filename):
        raise SkipTest("no %s found" % CHANGELOG_filename)
    regex = re.compile(r'^## '
                       r'(?P<version>[0-9]+\.[0-9.abcrc~]+)\s+'
                       r'\((?P<date>.*)\)'
                       r'\s+--\s+'
                       r'(?P<codename>.+)')
    with open(CHANGELOG_filename, 'rb') as f:
        for line in f:
            line = line.rstrip()
            if not line.startswith(b'## '):
                # The first section header we hit, must be our changelog entry
                continue
            reg = regex.match(ensure_unicode(line))
            if not reg:  # first one at that level is the one
                raise AssertionError(
                    "Following line must have matched our regex: %r" % line)
            regd = reg.groupdict()
            changelog_version = regd['version']
            lv_changelog_version = LooseVersion(changelog_version)
            # we might have a suffix - sanitize
            san__version__ = __version__.rstrip('.devdirty')
            lv__version__ = LooseVersion(san__version__)
            if '???' in regd['date'] and 'will be better than ever' in regd['codename']:
                # we only have our template
                # we can only assert that its version should be higher than
                # the one we have now
                assert_greater(lv_changelog_version, lv__version__)
            else:
                # should be a "release" record
                assert_not_in('???', regd['date'])
                assert_not_in('will be better than ever', regd['codename'])
                assert_equal(__hardcoded_version__, changelog_version)
                if __hardcoded_version__ != san__version__:
                    # It was not tagged yet and Changelog should have its
                    # template record for the next release
                    assert_greater(lv_changelog_version, lv__version__)
                    assert_in('.dev', san__version__)
                else:
                    # all is good, tagged etc
                    assert_equal(lv_changelog_version, lv__version__)
                    assert_equal(changelog_version, san__version__)
                    assert_equal(__hardcoded_version__, san__version__)
            return
    raise AssertionError(
        "No log line matching our regex found in %s" % CHANGELOG_filename)
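# Quick illustration of the CHANGELOG header regex used above, applied to a
# made-up entry line; the version, date, and codename values are hypothetical.
import re

regex = re.compile(r'^## '
                   r'(?P<version>[0-9]+\.[0-9.abcrc~]+)\s+'
                   r'\((?P<date>.*)\)'
                   r'\s+--\s+'
                   r'(?P<codename>.+)')
m = regex.match('## 0.14.0 (2021-02-02) -- some codename')
print(m.groupdict())
# {'version': '0.14.0', 'date': '2021-02-02', 'codename': 'some codename'}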
def generate_token(self, user=None, password=None):
    data = {'username': user, 'password': password}
    encoded_data = json.dumps(data).encode('utf-8')

    request = Request(self.url, encoded_data)

    try:
        response = urlopen(request)
    except HTTPError:
        raise AccessDeniedError("Could not authenticate into LORIS")

    str_response = ensure_unicode(response.read())
    data = json.loads(str_response)
    return data["token"]
def _get_untracked_content(dspath, report_untracked, paths=None):
    cmd = [
        'git', '--work-tree=.', 'status', '--porcelain',
        # file names NULL terminated
        '-z',
        # we never want to touch submodules, they cannot be untracked
        '--ignore-submodules=all',
        # fully untracked dirs as such, the rest as files
        '--untracked={}'.format(report_untracked)
    ]
    try:
        stdout = GitWitlessRunner(cwd=dspath).run(
            cmd, protocol=StdOutErrCapture)['stdout']
    except CommandError as e:
        # TODO should we catch any and handle them in here?
        raise e

    if paths:
        paths = [r['path'] for r in paths]
        if len(paths) == 1 and paths[0] == dspath:
            # nothing to filter
            paths = None

    from datalad.utils import ensure_unicode

    for line in stdout.split('\0'):
        if not line:
            continue
        line = ensure_unicode(line)
        if not line.startswith('?? '):
            # nothing untracked, ignore, task of `diff`
            continue
        apath = opj(
            dspath,
            # strip state marker
            line[3:])
        norm_apath = normpath(apath)
        if paths and not any(norm_apath == p or path_startswith(apath, p)
                             for p in paths):
            # we got a whitelist for paths, don't report any other
            continue
        ap = dict(
            path=norm_apath,
            parentds=dspath,
            state='untracked',
            type='directory' if isdir(apath) else 'file')
        yield ap
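# Standalone sketch of the same parsing idea without datalad's runner
# (assumes git is available and "." is a Git repository): split the
# NUL-terminated porcelain output and keep only the '?? ' (untracked) records.
import subprocess

out = subprocess.run(
    ['git', 'status', '--porcelain', '-z',
     '--ignore-submodules=all', '--untracked-files=normal'],
    capture_output=True, text=True, check=True).stdout
untracked = [line[3:] for line in out.split('\0') if line.startswith('?? ')]
print(untracked)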
def get_requested_error_output(self, return_stderr: bool):
    if not self.runner:
        return None

    stderr_content = ensure_unicode(self.stderr_output)
    if lgr.isEnabledFor(5):
        from . import cfg
        if cfg.getbool("datalad.log", "outputs", default=False):
            stderr_lines = stderr_content.splitlines()
            lgr.log(
                5,
                "stderr of %s had %d lines:",
                self.generator.runner.process.pid,
                len(stderr_lines))
            for line in stderr_lines:
                lgr.log(5, "| " + line)
    if return_stderr:
        return stderr_content
    return None
def test_inputs_quotes_needed(path):
    ds = Dataset(path).create(force=True)
    ds.save()
    cmd = "import sys; open(sys.argv[-1], 'w').write('!'.join(sys.argv[1:]))"
    # The string form of a command works fine when the inputs/outputs have
    # spaces ...
    cmd_str = "{} -c \"{}\" {{inputs}} {{outputs[0]}}".format(
        sys.executable, cmd)
    ds.run(cmd_str, inputs=["*.t*"], outputs=["out0"], expand="inputs")
    expected = u"!".join(
        list(sorted([OBSCURE_FILENAME + u".t", "bar.txt", "foo blah.txt"])) +
        ["out0"])
    with open(op.join(path, "out0")) as ifh:
        eq_(ensure_unicode(ifh.read()), expected)
    # ... but the list form of a command does not. (Don't test this failure
    # with the obscure file name because we'd need to know its composition to
    # predict the failure.)
    cmd_list = [sys.executable, "-c", cmd, "{inputs}", "{outputs[0]}"]
    ds.run(cmd_list, inputs=["*.txt"], outputs=["out0"])
    ok_file_has_content(op.join(path, "out0"), "bar.txt foo!blah.txt!out0")
def test_aggregation(path=None):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything aggregated we would get nothing and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
        assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  scope='branch')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     scope='branch')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                        scope='branch')
    ds.save(recursive=True)
    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_in_results(res, action='save', status="ok")
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also report layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # Now that we have annex.key
    # three different IDs
    assert_equal(
        3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] \
                    == ensure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok', clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])
def myopen(path, *args, **kwargs):
    if skip_regex and re.search(skip_regex, ensure_unicode(path)):
        return _builtins_open(path, *args, **kwargs)
    else:
        return myfile
def get_metadata(self, dataset, content):
    if not content:
        return {}, []
    context = {}
    contentmeta = []

    # which files to look for
    fname_match_regex = self.ds.config.get(
        'datalad.metadata.xmp.fname-match',
        '.*(jpg|jpeg|pdf|gif|tiff|tif|ps|eps|png|mp3|mp4|avi|wav)$')
    # apply IGNORECASE at compile time: a compiled pattern's match() takes a
    # start position as its second argument, not flags
    fname_match_regex = re.compile(fname_match_regex, re.IGNORECASE)

    log_progress(
        lgr.info,
        'extractorxmp',
        'Start XMP metadata extraction from %s', self.ds,
        total=len(self.paths),
        label='XMP metadata extraction',
        unit=' Files',
    )
    for f in self.paths:
        log_progress(
            lgr.info,
            'extractorxmp',
            'Extract XMP metadata from %s', f,
            update=1,
            increment=True)
        # run basic file name filter for performance reasons
        # it is OK to let false-positives through
        if fname_match_regex.match(f) is None:
            continue
        absfp = opj(self.ds.path, f)
        info = file_to_dict(absfp)
        if not info:
            # got nothing, likely nothing there
            # TODO check if this is an XMP sidecar file, parse that, and assign metadata
            # to the base file
            continue
        # update vocabulary
        vocab = {
            info[ns][0][0].split(':')[0]: {'@id': ns, 'type': vocabulary_id}
            for ns in info}
        # TODO this is dirty and assumed that XMP is internally consistent with the
        # definitions across all files -- which it likely isn't
        context.update(vocab)
        # now pull out actual metadata
        # cannot do simple dict comprehension, because we need to beautify things a little
        meta = {}
        for ns in info:
            for key, val, props in info[ns]:
                if not val:
                    # skip everything empty
                    continue
                if key.count('[') > 1:
                    # this is a nested array
                    # MIH: I do not think it is worth going here
                    continue
                if props['VALUE_IS_ARRAY']:
                    # we'll catch the actual array values later
                    continue
                # normalize value
                val = ensure_unicode(val)
                # non-breaking space
                val = val.replace(u"\xa0", ' ')
                field, idx, qual = xmp_field_re.match(key).groups()
                normkey = u'{}{}'.format(field, qual)
                if '/' in key:
                    normkey = u'{0}<{1}>'.format(*normkey.split('/'))
                if idx:
                    # array
                    arr = meta.get(normkey, [])
                    arr.append(val)
                    meta[normkey] = arr
                else:
                    meta[normkey] = val
        # compact
        meta = {
            k: v[0] if isinstance(v, list) and len(v) == 1 else v
            for k, v in meta.items()}

        contentmeta.append((f, meta))

    log_progress(
        lgr.info,
        'extractorxmp',
        'Finished XMP metadata extraction from %s', self.ds
    )
    return {
        '@context': context,
    }, \
        contentmeta
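# Minimal illustration (hypothetical field names) of the "compact" step above:
# single-element lists collected for array fields are collapsed back to scalars,
# while genuine multi-valued fields stay as lists.
meta = {'dc:title': ['A title'], 'dc:subject': ['a', 'b']}
meta = {k: v[0] if isinstance(v, list) and len(v) == 1 else v
        for k, v in meta.items()}
print(meta)  # {'dc:title': 'A title', 'dc:subject': ['a', 'b']}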
def _describe_datalad():
    return {
        'version': ensure_unicode(__version__),
        'full_version': ensure_unicode(__full_version__),
    }
def __call__(dataset=None, sensitive=None, sections=None, flavor="full", decor=None, clipboard=None): from datalad.distribution.dataset import require_dataset from datalad.support.exceptions import NoDatasetFound from datalad.interface.results import get_status_dict ds = None try: ds = require_dataset(dataset, check_installed=False, purpose='reporting') except NoDatasetFound: # failure is already logged pass if ds and not ds.is_installed(): # warn that the dataset is bogus yield dict( action='wtf', path=ds.path, status='impossible', message=('No dataset found at %s. Reporting on the dataset is ' 'not attempted.', ds.path), logger=lgr) # we don't deal with absent datasets ds = None if sensitive: if ds is None: from datalad import cfg else: cfg = ds.config else: cfg = None from datalad.ui import ui from datalad.support.external_versions import external_versions infos = OrderedDict() res = get_status_dict( action='wtf', path=ds.path if ds else ensure_unicode(op.abspath(op.curdir)), type='dataset' if ds else 'directory', status='ok', logger=lgr, decor=decor, infos=infos, flavor=flavor, ) # Define section callables which require variables. # so there is no side-effect on module level original section_callables = SECTION_CALLABLES.copy() section_callables['location'] = partial(_describe_location, res) section_callables['configuration'] = \ partial(_describe_configuration, cfg, sensitive) if ds: section_callables['dataset'] = \ partial(_describe_dataset, ds, sensitive) else: section_callables.pop('dataset') assert all(section_callables.values()) # check if none was missed asked_for_all_sections = sections is not None and any( s == '*' for s in sections) if sections is None or asked_for_all_sections: if flavor == 'full' or asked_for_all_sections: sections = sorted(list(section_callables)) elif flavor == 'short': sections = ['datalad', 'dependencies'] else: raise ValueError(flavor) for s in sections: infos[s] = section_callables[s]() if clipboard: external_versions.check( 'pyperclip', msg="It is needed to be able to use clipboard") import pyperclip report = _render_report(res) pyperclip.copy(report) ui.message("WTF information of length %s copied to clipboard" % len(report)) yield res return
def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace,
                                      spec, includeds=None):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be an item with its path under the key (path) of the respective
      superdataset.
    includeds : sequence, optional
      Any paths given are treated as existing subdatasets, regardless of
      whether they can be found in the filesystem. Such subdatasets will appear
      under the key of the closest existing dataset in the `spec`.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` dict in-place.
      Keys are dataset paths, values are sets of subdataset paths
    """
    # convert to set for faster lookup
    includeds = includeds if isinstance(includeds, set) else \
        set() if includeds is None else set(includeds)
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accumulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    # this edge is not done, we need to try to reach any downstream
    # dataset
    undiscovered_ds = set(t for t in targetpaths)  # if t != basepath)
    # whether anything in this directory matched a targetpath
    filematch = False
    if isdir(basepath):
        for p in listdir(basepath):
            p = ensure_unicode(opj(basepath, p))
            if not isdir(p):
                if p in targetpaths:
                    filematch = True
                # we cannot have anything below this one
                continue
            # OPT listdir might be large and we could have only few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            # filter targets matching this downward path
            downward_targets = set(
                t for t in targetpaths if path_startswith(t, p))
            if not downward_targets:
                continue
            # remove the matching ones from the "todo" list
            undiscovered_ds.difference_update(downward_targets)
            # go one deeper
            discover_dataset_trace_to_targets(
                p, downward_targets, current_trace, spec,
                includeds=includeds if not includeds
                else includeds.intersection(downward_targets))

    undiscovered_ds = [
        t for t in undiscovered_ds
        if includeds and
        path_is_subpath(t, current_trace[-1]) and
        t in includeds
    ]
    if filematch or basepath in targetpaths or undiscovered_ds:
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            subds = spec.get(p, set())
            subds.add(current_trace[i + 1])
            spec[p] = subds
        if undiscovered_ds:
            spec[current_trace[-1]] = spec.get(
                current_trace[-1], set()).union(undiscovered_ds)
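# Tiny illustration (plain Python, hypothetical paths, not the datalad API) of
# the `spec` shape produced by the final loop above: every superdataset path
# maps to the set of subdataset paths discovered directly beneath it along the
# accumulated trace.
current_trace = ['/data/top', '/data/top/mid', '/data/top/mid/leaf']
spec = {}
for i, p in enumerate(current_trace[:-1]):
    spec.setdefault(p, set()).add(current_trace[i + 1])
print(spec)
# {'/data/top': {'/data/top/mid'}, '/data/top/mid': {'/data/top/mid/leaf'}}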
def test_wtf(topdir):
    path = opj(topdir, OBSCURE_FILENAME)
    # smoke test for now
    with swallow_outputs() as cmo:
        wtf(dataset=path, on_failure="ignore")
        assert_not_in('## dataset', cmo.out)
        assert_in('## configuration', cmo.out)
        # Those sections get censored out by default now
        assert_not_in('user.name: ', cmo.out)
    with chpwd(path):
        with swallow_outputs() as cmo:
            wtf()
            assert_not_in('## dataset', cmo.out)
            assert_in('## configuration', cmo.out)
    # now with a dataset
    ds = create(path)
    with swallow_outputs() as cmo:
        wtf(dataset=ds.path)
        assert_in('## configuration', cmo.out)
        assert_in('## dataset', cmo.out)
        assert_in(u'path: {}'.format(ds.path),
                  ensure_unicode(cmo.out))

    # and if we run with all sensitive
    for sensitive in ('some', True):
        with swallow_outputs() as cmo:
            wtf(dataset=ds.path, sensitive=sensitive)
            # we fake those for tests anyways, but we do show cfg in this mode
            # and explicitly not showing them
            assert_in('user.name: %s' % _HIDDEN, cmo.out)

    with swallow_outputs() as cmo:
        wtf(dataset=ds.path, sensitive='all')
        assert_not_in(_HIDDEN, cmo.out)  # all is shown
        assert_in('user.name: ', cmo.out)

    # Sections selection
    #
    # If we ask for no sections and there is no dataset
    with chpwd(path):
        with swallow_outputs() as cmo:
            wtf(sections=[])
            assert_not_in('## dataset', cmo.out)
            for s in SECTION_CALLABLES:
                assert_not_in('## %s' % s.lower(), cmo.out.lower())

    # ask for a selected set
    secs = ['git-annex', 'configuration']
    with chpwd(path):
        with swallow_outputs() as cmo:
            wtf(sections=secs)
            for s in SECTION_CALLABLES:
                (assert_in if s in secs else assert_not_in)(
                    '## %s' % s.lower(), cmo.out.lower()
                )
            # order should match our desired one, not alphabetical
            # but because of https://github.com/datalad/datalad/issues/3915
            # alphanum is now desired
            assert cmo.out.index('## git-annex') > cmo.out.index('## configuration')

    # not achievable from cmdline is to pass an empty list of sections.
    with chpwd(path):
        with swallow_outputs() as cmo:
            wtf(sections=[])
            eq_(cmo.out.rstrip(), '# WTF')

    # and we could decorate it nicely for embedding e.g. into github issues
    with swallow_outputs() as cmo:
        wtf(sections=['dependencies'], decor='html_details')
        ok_startswith(cmo.out, '<details><summary>DataLad %s WTF' % __version__)
        assert_in('## dependencies', cmo.out)

    # short flavor
    with swallow_outputs() as cmo:
        wtf(flavor='short')
        assert_in("- datalad: version=%s" % __version__, cmo.out)
        assert_in("- dependencies: ", cmo.out)
        eq_(len(cmo.out.splitlines()), 4)  # #WTF, datalad, dependencies, trailing new line

    with swallow_outputs() as cmo:
        wtf(flavor='short', sections='*')
        assert_greater(len(cmo.out.splitlines()), 10)  # many more

    # should result only in '# WTF'
    skip_if_no_module('pyperclip')

    # verify that it works correctly in the env/platform
    import pyperclip
    with swallow_outputs() as cmo:
        try:
            pyperclip.copy("xxx")
            pyperclip_works = pyperclip.paste().strip() == "xxx"
            wtf(dataset=ds.path, clipboard=True)
        except (AttributeError, pyperclip.PyperclipException) as exc:
            # AttributeError could come from pyperclip if no DISPLAY
            raise SkipTest(exc_str(exc))
        assert_in("WTF information of length", cmo.out)
        assert_not_in('user.name', cmo.out)
        if not pyperclip_works:
            # Some times does not throw but just fails to work
            raise SkipTest(
                "Pyperclip seems to be not functioning here correctly")
        assert_not_in('user.name', pyperclip.paste())
        assert_in(_HIDDEN, pyperclip.paste())  # by default no sensitive info
        assert_in("cmd:annex:", pyperclip.paste())  # but the content is there
def to_str(self): return ("MetadataKeyException(" + ensure_unicode(self.message) + ": " + ", ".join(map(ensure_unicode, self.keys)) + ")")