def test_logging_to_a_file(dst):
    ok_(not exists(dst))

    lgr = LoggerHelper("dataladtest-1").get_initialized_logger(logtarget=dst)
    ok_(exists(dst))  # nothing was logged -- no file created

    msg = "Oh my god, they killed Kenny"
    lgr.error(msg)
    with open(dst) as f:
        lines = f.readlines()
    assert_equal(len(lines), 1, "Read more than a single log line: %s" % lines)
    line = lines[0]
    ok_(msg in line)
    ok_('\033[' not in line,
        msg="There should be no color formatting in log files. Got: %s" % line)
    # verify that time stamp and level are present in the log line
    # do not want to rely on not having race conditions around date/time changes
    # so matching just with regexp
    # (...)? is added to swallow possible traceback logs
    regex = r"\[ERROR\]"
    if EnsureBool()(cfg.get('datalad.log.timestamp', False)):
        regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} " + regex
    if EnsureBool()(cfg.get('datalad.log.vmem', False)):
        regex += r' RSS/VMS: \S+/\S+( \S+)?\s*'
    regex += r"(\s+\S+\s*)? " + msg
    assert_re_in(regex, line, match=True)
    # Close all handlers so windows is happy -- apparently not closed fast enough
    for handler in lgr.handlers:
        handler.close()
def test_logging_to_a_file(dst):
    ok_(not exists(dst))

    lgr = LoggerHelper("dataladtest-1").get_initialized_logger(logtarget=dst)
    ok_(exists(dst))  # nothing was logged -- no file created

    msg = "Oh my god, they killed Kenny"
    lgr.error(msg)
    with open(dst) as f:
        lines = f.readlines()
    assert_equal(len(lines), 1, "Read more than a single log line: %s" % lines)
    line = lines[0]
    ok_(msg in line)
    ok_('\033[' not in line,
        msg="There should be no color formatting in log files. Got: %s" % line)
    # verify that time stamp and level are present in the log line
    # do not want to rely on not having race conditions around date/time changes
    # so matching just with regexp
    # (...)? is added to swallow possible traceback logs
    regex = r"\[ERROR\]"
    if EnsureBool()(dl_cfg.get('datalad.log.timestamp', False)):
        regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} " + regex
    if EnsureBool()(dl_cfg.get('datalad.log.vmem', False)):
        regex += r' RSS/VMS: \S+/\S+( \S+)?\s*'
    regex += r"(\s+\S+\s*)? " + msg
    assert_re_in(regex, line, match=True)
    # Close all handlers so windows is happy -- apparently not closed fast enough
    for handler in lgr.handlers:
        handler.close()
def test_global_config():
    # from within tests, global config should be read from faked $HOME (see
    # setup_package)
    glb_cfg_file = Path(os.environ['HOME']) / '.gitconfig'
    assert any(glb_cfg_file.samefile(Path(p)) for p in dl_cfg._cfgfiles)
    assert_equal(dl_cfg.get("user.name"), "DataLad Tester")
    assert_equal(dl_cfg.get("user.email"), "*****@*****.**")
def test_global_config():
    # from within tests, global config should be read from faked $HOME (see
    # setup_package)
    glb_cfg_file = Path(os.path.expanduser('~')) / '.gitconfig'
    assert any(
        glb_cfg_file.samefile(Path(p))
        for p in dl_cfg._stores['git']['files'])
    assert_equal(dl_cfg.get("user.name"), "DataLad Tester")
    assert_equal(dl_cfg.get("user.email"), "*****@*****.**")
def _ok_metadata(res, msrc, ds, loc):
    restype = res.get('type', None)
    if restype not in ('dataset', 'file'):  # pragma: no cover
        # untested, would need broken extractor
        lgr.error(
            'metadata report for something other than a file or dataset: %s',
            restype)
        return False

    meta = res.get('metadata', None)
    if meta is None or isinstance(meta, dict):
        return True
    else:  # pragma: no cover
        # untested, needs broken extractor
        msg = (
            "Metadata extractor '%s' yielded something other than a "
            "dictionary for dataset %s%s -- this is likely a bug, "
            "please consider reporting it. "
            "This type of native metadata will be ignored. Got: %s",
            msrc,
            ds,
            '' if loc is None else ' content {}'.format(loc),
            repr(meta))
        if cfg.get('datalad.runtime.raiseonerror'):
            raise RuntimeError(*msg)
        lgr.error(*msg)
        return False
def test_install_dataset_from_just_source(src_repo=None, path=None):
    src_ds = Dataset(src_repo).create(result_renderer='disabled', force=True)
    src_ds.save(['INFO.txt', 'test.dat'], to_git=True)
    src_ds.save('test-annex.dat', to_git=False)
    # equivalent repo on github:
    src_url = "https://github.com/datalad/testrepo--basic--r1.git"
    sources = [
        src_ds.path,
        get_local_file_url(src_ds.path, compatibility='git')
    ]
    if not dl_cfg.get('datalad.tests.nonetwork'):
        sources.append(src_url)

    for url in sources:
        with chpwd(path, mkdir=True):
            ds = install(source=url)

        ok_startswith(ds.path, path)
        ok_(ds.is_installed())
        ok_(GitRepo.is_valid_repo(ds.path))
        assert_repo_status(ds.path, annex=None)
        assert_in('INFO.txt', ds.repo.get_indexed_files())

        # cleanup before next iteration
        rmtree(path)
def test_ExtractedArchive(path):
    archive = op.join(path, fn_archive_obscure_ext)
    earchive = ExtractedArchive(archive)
    assert_false(op.exists(earchive.path))
    # no longer the case -- just using hash for now
    # assert_in(os.path.basename(archive), earchive.path)

    fpath = op.join(
        fn_archive_obscure,  # lead directory
        fn_in_archive_obscure)
    extracted = earchive.get_extracted_filename(fpath)
    eq_(extracted, op.join(earchive.path, fpath))
    assert_false(op.exists(extracted))  # not yet

    extracted_ = earchive.get_extracted_file(fpath)
    eq_(extracted, extracted_)
    assert_true(op.exists(extracted))  # now it should

    extracted_files = earchive.get_extracted_files()
    ok_generator(extracted_files)
    eq_(
        sorted(extracted_files),
        sorted([
            # ['bbc/3.txt', 'bbc/abc']
            op.join(fn_archive_obscure, fn_in_archive_obscure),
            op.join(fn_archive_obscure, '3.txt')
        ]))

    earchive.clean()
    if not dl_cfg.get('datalad.tests.temp.keep'):
        assert_false(op.exists(earchive.path))
def test_logging_to_a_file(dst):
    ok_(not exists(dst))
    lgr = LoggerHelper("dataladtest-1").get_initialized_logger(logtarget=dst)
    ok_(exists(dst))  # nothing was logged -- no file created
    msg = "Oh my god, they killed Kenny"
    lgr.error(msg)
    with open(dst) as f:
        lines = f.readlines()
    assert_equal(len(lines), 1, "Read more than a single log line: %s" % lines)
    line = lines[0]
    ok_(msg in line)
    ok_('\033[' not in line,
        msg="There should be no color formatting in log files. Got: %s" % line)
    # verify that time stamp and level are present in the log line
    # do not want to rely on not having race conditions around date/time changes
    # so matching just with regexp
    # .* is added to swallow possible traceback logs
    if EnsureBool()(cfg.get('datalad.log.timestamp', False)):
        ok_(re.match(
            r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} "
            r"\[ERROR\](\s+\S+\s*)? %s" % msg,
            line))
    else:
        ok_(re.match(r"\[ERROR\](\s+\S+\s*)? %s" % msg, line))
def load_extensions():
    """Load entrypoint for any configured extension package

    Log a warning in case a requested extension is not available, or if a
    requested extension fails on load.

    Extensions to load are taken from the 'datalad.extensions.load'
    configuration item.
    """
    from datalad import cfg
    load_extensions = cfg.get('datalad.extensions.load', get_all=True)
    if load_extensions:
        from datalad.utils import ensure_list
        exts = {
            ename: eload
            for ename, _, eload in iter_entrypoints('datalad.extensions')
        }
        for el in ensure_list(load_extensions):
            if el not in exts:
                lgr.warning('Requested extension %r is not available', el)
                continue
            try:
                exts[el]()
            except Exception as e:
                ce = CapturedException(e)
                lgr.warning('Could not load extension %r: %s', el, ce)
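# Hedged usage sketch (not from the original module): the config item
# consulted above maps onto the DATALAD_EXTENSIONS_LOAD environment
# variable via DataLad's standard config-to-env translation; 'next' is an
# illustrative extension name, not an assertion that it is installed.
#
#   import os
#   os.environ['DATALAD_EXTENSIONS_LOAD'] = 'next'
#   load_extensions()  # warns if 'next' is unavailable or fails to load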
def newfunc(*args, **kwargs):
    if on_windows:
        raise SkipTest("SSH currently not available on windows.")
    from datalad import cfg
    test_ssh = cfg.get("datalad.tests.ssh", '')
    if test_ssh in ('', '0', 'false', 'no'):
        raise SkipTest("Run this test by setting DATALAD_TESTS_SSH")
    return func(*args, **kwargs)
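# Hedged usage sketch: `newfunc` above is the inner wrapper of a
# skip-unless-SSH test decorator (DataLad ships one as
# datalad.tests.utils.skip_ssh); it would be applied roughly like this:
#
#   @skip_ssh
#   def test_something_over_ssh():
#       ...  # only runs when DATALAD_TESTS_SSH is set to a true value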
def test_update_fetch_all(path=None):
    path = Path(path)
    remote_1 = str(path / "remote_1")
    remote_2 = str(path / "remote_2")

    ds = Dataset(path / "src").create()
    src = ds.repo.path

    ds_rmt1 = clone(source=src, path=remote_1)
    ds_rmt2 = clone(source=src, path=remote_2)

    ds.siblings('add', name="sibling_1", url=remote_1)
    ds.siblings('add', name="sibling_2", url=remote_2)

    # modify the remotes:
    (ds_rmt1.pathobj / "first.txt").write_text("some file load")
    ds_rmt1.save()
    # TODO: Modify an already present file!
    (ds_rmt2.pathobj / "second.txt").write_text("different file load")
    ds_rmt2.save()

    # Let's init some special remote which we couldn't really update/fetch
    if not dl_cfg.get('datalad.tests.dataladremote'):
        ds.repo.init_remote(
            'datalad',
            ['encryption=none', 'type=external', 'externaltype=datalad'])

    # fetch all remotes
    assert_result_count(ds.update(), 1, status='ok', type='dataset')

    # no merge, so changes are not in active branch:
    assert_not_in("first.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    assert_not_in("second.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    # but we know the changes in remote branches:
    assert_in("first.txt", ds.repo.get_files("sibling_1/" + DEFAULT_BRANCH))
    assert_in("second.txt", ds.repo.get_files("sibling_2/" + DEFAULT_BRANCH))

    # no merge strategy for multiple remotes yet:
    # more clever now, there is a tracking branch that provides a remote
    #assert_raises(NotImplementedError, ds.update, merge=True)

    # merge a certain remote:
    assert_result_count(
        ds.update(sibling='sibling_1', merge=True),
        1, action='update', status='ok', type='dataset')

    # changes from sibling_2 still not present:
    assert_not_in("second.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    # changes from sibling_1 merged:
    assert_in("first.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    # it's known to annex, but has no content yet:
    annexprops = ds.repo.get_file_annexinfo("first.txt",
                                            eval_availability=True)
    annexprops['key']  # blows if unknown
    eq_(False, annexprops['has_content'])
def get_connection(self, url, use_remote_annex_bundle=True, force_ip=False):
    """Get a singleton, representing a shared ssh connection to `url`

    Parameters
    ----------
    url: str
      ssh url
    force_ip : {False, 4, 6}
      Force the use of IPv4 or IPv6 addresses.

    Returns
    -------
    SSHConnection
    """
    # parse url:
    from datalad.support.network import RI, is_ssh
    if isinstance(url, RI):
        sshri = url
    else:
        if ':' not in url and '/' not in url:
            # it is just a hostname
            lgr.debug("Assuming %r is just a hostname for ssh connection",
                      url)
            url += ':'
        sshri = RI(url)

    if not is_ssh(sshri):
        raise ValueError("Unsupported SSH URL: '{0}', use "
                         "ssh://host/path or host:path syntax".format(url))

    from datalad import cfg
    identity_file = cfg.get("datalad.ssh.identityfile")

    conhash = get_connection_hash(
        sshri.hostname,
        port=sshri.port,
        identity_file=identity_file or "",
        username=sshri.username,
        bundled=use_remote_annex_bundle,
        force_ip=force_ip,
    )
    # determine control master:
    ctrl_path = self.socket_dir / conhash

    # do we know it already?
    if ctrl_path in self._connections:
        return self._connections[ctrl_path]
    else:
        c = SSHConnection(ctrl_path, sshri, identity_file=identity_file,
                          use_remote_annex_bundle=use_remote_annex_bundle,
                          force_ip=force_ip)
        self._connections[ctrl_path] = c
        return c
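# Hedged usage sketch: `get_connection` above is a method of DataLad's
# SSHManager; the host and path below are illustrative.
#
#   from datalad.support.sshconnector import SSHManager
#   manager = SSHManager()
#   conn = manager.get_connection('ssh://example.com/home/me/ds')
#   out, err = conn('ls')  # commands share the one control-master socket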
def _get_format(self, log_name=False, log_pid=False):
    from datalad import cfg
    from datalad.config import anything2bool
    show_timestamps = anything2bool(cfg.get('datalad.log.timestamp', False))
    return (("" if not show_timestamps else "$BOLD%(asctime)-15s$RESET ") +
            ("%(name)-15s " if log_name else "") +
            ("{%(process)d}" if log_pid else "") +
            "[%(levelname)s] "
            "%(message)s ")
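# Illustration (assumed, not part of the original module): with timestamps
# enabled and log_name=True, the pieces above concatenate to
#
#   "$BOLD%(asctime)-15s$RESET %(name)-15s [%(levelname)s] %(message)s "
#
# where the $BOLD/$RESET placeholders are presumably substituted with
# terminal escape codes (or stripped) by a color-aware formatter before
# the string is handed to logging.Formatter.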
def test_update_fetch_all(src, remote_1, remote_2):
    rmt1 = AnnexRepo.clone(src, remote_1)
    rmt2 = AnnexRepo.clone(src, remote_2)

    ds = Dataset(src)
    ds.siblings('add', name="sibling_1", url=remote_1)
    ds.siblings('add', name="sibling_2", url=remote_2)

    # modify the remotes:
    with open(opj(remote_1, "first.txt"), "w") as f:
        f.write("some file load")
    rmt1.add("first.txt")
    rmt1.commit()
    # TODO: Modify an already present file!

    with open(opj(remote_2, "second.txt"), "w") as f:
        f.write("different file load")
    rmt2.add("second.txt", git=True)
    rmt2.commit(msg="Add file to git.")

    # Let's init some special remote which we couldn't really update/fetch
    if not dl_cfg.get('datalad.tests.dataladremote'):
        ds.repo.init_remote(
            'datalad',
            ['encryption=none', 'type=external', 'externaltype=datalad'])

    # fetch all remotes
    assert_result_count(
        ds.update(), 1, status='ok', type='dataset')

    # no merge, so changes are not in active branch:
    assert_not_in("first.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    assert_not_in("second.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    # but we know the changes in remote branches:
    assert_in("first.txt", ds.repo.get_files("sibling_1/" + DEFAULT_BRANCH))
    assert_in("second.txt", ds.repo.get_files("sibling_2/" + DEFAULT_BRANCH))

    # no merge strategy for multiple remotes yet:
    # more clever now, there is a tracking branch that provides a remote
    #assert_raises(NotImplementedError, ds.update, merge=True)

    # merge a certain remote:
    assert_result_count(
        ds.update(sibling='sibling_1', merge=True),
        1, action='update', status='ok', type='dataset')

    # changes from sibling_2 still not present:
    assert_not_in("second.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    # changes from sibling_1 merged:
    assert_in("first.txt", ds.repo.get_files(ds.repo.get_active_branch()))
    # it's known to annex, but has no content yet:
    ds.repo.get_file_key("first.txt")  # raises if unknown
    eq_([False], ds.repo.file_has_content(["first.txt"]))
def _get_plugin_specs(param_key=None, cfg_key=None):
    spec = common_params.get(param_key, None)
    if spec is not None:
        # this is already a list of lists
        return spec
    spec = dlcfg.get(cfg_key, None)
    if spec is None:
        return
    elif not isinstance(spec, tuple):
        spec = [spec]
    return [shlex.split(s) for s in spec]
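# Example of the normalization performed above (config values are
# illustrative): whether the config carries a single string or a tuple of
# strings, the result is a list of argv-style lists.
import shlex
assert shlex.split('extract_metadata type=xmp') == \
    ['extract_metadata', 'type=xmp']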
def _get_github_entity(gh, cred, github_user, github_passwd,
                       github_organization):
    # figure out authentication
    if not (github_user and github_passwd):
        # access to the system secrets
        if github_user:
            # check that the keystore knows about this user
            if github_user != cred.get('user', github_user):
                # there is a mismatch, we need to ask
                creds = cred.enter_new()
                github_user = creds['user']
                github_passwd = creds['password']

        # if a user is provided, go with it, don't even ask any store
        if github_user is None and not cred.is_known:
            # let's figure out authentication
            if github_user is None:
                # check if there is an oauth token from
                # https://github.com/sociomantic/git-hub
                github_user = cfg.get('hub.oauthtoken', None)

        if github_user is None:
            # still nothing, ask if necessary
            creds = cred()
            github_user = creds['user']
            github_passwd = creds['password']

    if not github_user:
        raise gh.BadCredentialsException(403, 'no user specified')

    # this will always succeed, but it might later throw an exception
    # if the credentials were wrong
    # XXX make sure to wipe out known credentials if that happens
    authed_gh = gh.Github(
        github_user,
        password=github_passwd)

    try:
        if github_organization:
            try:
                entity = authed_gh.get_organization(github_organization)
            except gh.UnknownObjectException as e:
                raise ValueError('unknown organization "{}" [{}]'.format(
                    github_organization,
                    exc_str(e)))
        else:
            entity = authed_gh.get_user()
    except gh.BadCredentialsException as e:
        # things blew up, wipe out cred store, if anything is in it
        if cred.is_known:
            cred.delete()
        raise e

    return entity
def _run_extractor(extractor_cls, name, ds, refcommit, status, process_type):
    """Helper to control extractor using the right API

    Central switch to deal with alternative/future APIs is inside
    """
    try:
        # detect supported API and interface as needed
        if issubclass(extractor_cls, MetadataExtractor):
            # new-style, command-like extractors
            extractor = extractor_cls()
            for r in extractor(
                    dataset=ds,
                    refcommit=refcommit,
                    status=status,
                    process_type=process_type):
                yield r
        elif hasattr(extractor_cls, 'get_metadata'):  # pragma: no cover
            # old-style, keep around for a while, but don't sweat over it much
            for res in _yield_res_from_pre2019_extractor(
                    ds,
                    name,
                    extractor_cls,
                    process_type,
                    # old extractors only take a list of relative paths
                    # and cannot benefit from outside knowledge
                    # TODO avoid is_installed() call
                    [text_type(Path(p['path']).relative_to(ds.pathobj))
                     if ds.is_installed() else p['path']
                     for p in status]):
                yield res
        else:  # pragma: no cover
            raise RuntimeError(
                '{} does not have a recognised extractor API'.format(
                    extractor_cls))
    except Exception as e:  # pragma: no cover
        if cfg.get('datalad.runtime.raiseonerror'):
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', name, ds,
            )
            raise
        yield get_status_dict(
            ds=ds,
            # any errors will have been reported before
            status='error',
            message=('Failed to get %s metadata (%s): %s',
                     ds, name, exc_str(e)),
        )
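# Hedged sketch (illustrative, not a shipped DataLad extractor) of the
# "new-style" interface dispatched above: a MetadataExtractor subclass is
# called like a command and yields one result record per status entry.
#
#   class ToyNewExtractor(MetadataExtractor):
#       def __call__(self, dataset, refcommit, status, process_type):
#           for s in status:
#               yield dict(
#                   path=s['path'],
#                   type='file',
#                   status='ok',
#                   metadata={'toy': 'value'},
#               )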
def _get_github_entity(gh, cred, github_login, github_passwd,
                       github_organization):
    if github_login == 'disabledloginfortesting':
        raise gh.BadCredentialsException(403, 'no login specified')

    if not (github_login and github_passwd):
        # we don't have both
        # check if there is an oauth token from
        # https://github.com/sociomantic/git-hub
        token = False
        if not cred.is_known:
            if not github_login:
                # try find a token as login
                github_login = cfg.get('hub.oauthtoken', None)
                token = True
        if not (github_login and (github_passwd or token)):
            # still at least one missing, utilize the credential store
            # to get auth info, pass potential passwd value along
            cred.enter_new(
                user=github_login,
                password=github_passwd)

        # now we should really have it
        creds = cred()
        github_login = creds['user']
        github_passwd = creds['password']

    if not github_login:
        raise gh.BadCredentialsException(403, 'no login specified')

    # this will always succeed, but it might later throw an exception
    # if the credentials were wrong
    # and in this case, known credentials are wiped out again below
    authed_gh = gh.Github(
        github_login,
        password=github_passwd)

    try:
        if github_organization:
            try:
                entity = authed_gh.get_organization(github_organization)
            except gh.UnknownObjectException as e:
                raise ValueError('unknown organization "{}" [{}]'.format(
                    github_organization,
                    exc_str(e)))
        else:
            entity = authed_gh.get_user()
    except gh.BadCredentialsException as e:
        # things blew up, wipe out cred store, if anything is in it
        if cred.is_known:
            cred.delete()
        raise e

    return entity
def test_addurls(self, path):
    ds = Dataset(path).create(force=True)

    def get_annex_commit_counts():
        return len(ds.repo.get_revisions("git-annex"))

    n_annex_commits = get_annex_commit_counts()

    ds.addurls(self.json_file, "{url}", "{name}")

    filenames = ["a", "b", "c"]
    for fname in filenames:
        ok_exists(op.join(ds.path, fname))

    for (fname, meta), subdir in zip(ds.repo.get_metadata(filenames),
                                     ["foo", "bar", "foo"]):
        assert_dict_equal(meta, {"subdir": [subdir], "name": [fname]})

    # Ignore this check if we're faking dates because that disables
    # batch mode.
    if not dl_cfg.get('datalad.fake-dates'):
        # We should have two new commits on the git-annex branch: one for
        # the added urls and one for the added metadata.
        eq_(n_annex_commits + 2, get_annex_commit_counts())

    # Add to already existing links, overwriting.
    with swallow_logs(new_level=logging.DEBUG) as cml:
        ds.addurls(self.json_file, "{url}", "{name}",
                   ifexists="overwrite")
        for fname in filenames:
            assert_in("Removing {}".format(os.path.join(path, fname)),
                      cml.out)

    # Add to already existing links, skipping.
    assert_in_results(
        ds.addurls(self.json_file, "{url}", "{name}", ifexists="skip"),
        action="addurls",
        status="notneeded")

    # Add to already existing links works, as long as content is the same.
    ds.addurls(self.json_file, "{url}", "{name}")

    # But it fails if something has changed.
    ds.unlock("a")
    with open(op.join(ds.path, "a"), "w") as ofh:
        ofh.write("changed")
    ds.save("a")

    assert_raises(IncompleteResultsError,
                  ds.addurls,
                  self.json_file, "{url}", "{name}")
def _ok_metadata(meta, mtype, ds, loc):
    if meta is None or isinstance(meta, dict):
        return True

    msg = (
        "Metadata extractor '%s' yielded something other than a dictionary "
        "for dataset %s%s -- this is likely a bug, please consider "
        "reporting it. "
        "This type of native metadata will be ignored. Got: %s",
        mtype,
        ds,
        '' if loc is None else ' content {}'.format(loc),
        repr(meta))
    if cfg.get('datalad.runtime.raiseonerror'):
        raise RuntimeError(*msg)

    lgr.error(*msg)
    return False
def get_cached_url_content(url, name=None, fetcher=None, maxage=None):
    """Loader of a document from a url, which caches loaded instance on disk

    Doesn't do anything smart about http headers etc which could provide
    information for cache/proxy servers for how long to retain etc

    TODO: theoretically it is not network specific at all -- and just a
    memoize pattern, but may be some time we would make it treat headers
    etc correctly.  And ATM would support any URL we support via
    providers/downloaders

    Parameters
    ----------
    fetcher: callable, optional
       Function to call with url if needed to be refetched
    maxage: float, optional
       Age in days to retain valid for.  <0 - would retain forever.
       If None - would consult the config, 0 - would force to reload
    """
    doc_fname = get_url_cache_filename(url, name)
    if maxage is None:
        maxage = float(cfg.get('datalad.locations.cache-maxage'))

    doc = None
    if os.path.exists(doc_fname) and maxage != 0:
        fage = (time.time() - os.stat(doc_fname).st_mtime) / (24. * 3600)
        if maxage < 0 or fage < maxage:
            try:
                lgr.debug("use cached request result to '%s' from %s",
                          url, doc_fname)
                doc = pickle.load(open(doc_fname, 'rb'))
            except Exception as e:
                # it is OK to ignore any error and fall back on the true source
                lgr.warning(
                    "cannot load cache from '%s', fall back to download: %s",
                    doc_fname, exc_str(e))

    if doc is None:
        if fetcher is None:
            from datalad.downloaders.providers import Providers
            providers = Providers.from_config_files()
            fetcher = providers.fetch

        doc = fetcher(url)
        assure_dir(dirname(doc_fname))
        # use pickle to store the entire request result dict
        pickle.dump(doc, open(doc_fname, 'wb'))
        lgr.debug("stored result of request to '{}' in {}".format(
            url, doc_fname))
    return doc
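# Hedged usage sketch (URL and name are illustrative): fetch a document
# at most once per day, no matter how often this code path runs.
#
#   doc = get_cached_url_content(
#       'https://example.com/catalog.json',
#       name='catalog',  # becomes part of the cache filename
#       maxage=1.0,      # days; <0 keeps forever, 0 forces re-download
#   )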
def test_get_resolved_values():
    from datalad.tests.utils_pytest import _get_resolved_flavors
    flavors = ['networkish', 'local']
    eq_(([] if dl_cfg.get('datalad.tests.nonetwork') else ['networkish'])
        + ['local'],
        _get_resolved_flavors(flavors))

    with patch_config({'datalad.tests.nonetwork': '1'}):
        eq_(_get_resolved_flavors(flavors), ['local'])

    # and one more to see the exception being raised if nothing to test on
    @with_testrepos(flavors=['network'])
    def magical():
        raise AssertionError("Must not be run")
    assert_raises(Skipped, magical)
def test_with_tempfile_mkdir():
    dnames = []  # just to store the name within the decorated function

    @with_tempfile(mkdir=True)
    def check_mkdir(d1):
        ok_(os.path.exists(d1))
        ok_(os.path.isdir(d1))
        dnames.append(d1)
        eq_(glob(os.path.join(d1, '*')), [])
        # Create a file to assure we can remove later the temporary load
        with open(os.path.join(d1, "test.dat"), "w") as f:
            f.write("TEST LOAD")

    check_mkdir()
    if not dl_cfg.get('datalad.tests.temp.keep'):
        ok_(not os.path.exists(dnames[0]))  # got removed
def _get_cnmeta(self, bids):
    # TODO any custom handling of participants infos should eventually
    # be done by pybids in one way or another
    path_props = {}
    participants_fname = opj(self.ds.path, 'participants.tsv')
    if exists(participants_fname):
        try:
            for rx, info in yield_participant_info(participants_fname):
                path_props[rx] = info
        except Exception as exc:
            lgr.warning(
                "Failed to load participants info due to: %s. "
                "Skipping the rest of file",
                exc_str(exc))

    # now go over all files in the dataset and query pybids for its take
    # on each of them
    for f in self.paths:
        # BIDS carries a substantial portion of its metadata in JSON
        # sidecar files. we ignore them here completely
        # this might yield some false-negatives in theory, but
        # this case has not been observed in practice yet, hence
        # doing it cheap for now
        if f.endswith('.json'):
            continue
        md = {}
        try:
            md.update(
                {'bids:{}'.format(k): v
                 for k, v in bids.get_metadata(opj(self.ds.path, f)).items()
                 # no nested structures for now (can be monstrous when
                 # DICOM metadata is embedded)
                 if not isinstance(v, dict)})
        except Exception as e:
            lgr.debug('no usable BIDS metadata for %s in %s: %s',
                      f, self.ds, exc_str(e))
            if cfg.get('datalad.runtime.raiseonerror'):
                raise
        # now check all props from other sources and apply them
        for rx in path_props:
            if rx.match(f):
                md.update(path_props[rx])
        yield f, md
def test_with_testrepos():
    repos = []

    @with_testrepos
    def check_with_testrepos(repo):
        repos.append(repo)

    check_with_testrepos()

    eq_(len(repos),
        2 if on_windows  # TODO -- would fail now in DATALAD_TESTS_NONETWORK mode
        else (15 if dl_cfg.get('datalad.tests.nonetwork') else 16))
    # local, local-url, clone, network

    for repo in repos:
        if not (repo.startswith('git://') or repo.startswith('http')):
            # either it is a "local" or a removed clone
            ok_(exists(opj(repo, '.git'))
                or not exists(opj(repo, '.git', 'remove-me')))
def _prep_connection_args(self, url):
    # parse url:
    from datalad.support.network import RI, is_ssh
    if isinstance(url, RI):
        sshri = url
    else:
        if ':' not in url and '/' not in url:
            # it is just a hostname
            lgr.debug("Assuming %r is just a hostname for ssh connection",
                      url)
            url += ':'
        sshri = RI(url)

    if not is_ssh(sshri):
        raise ValueError("Unsupported SSH URL: '{0}', use "
                         "ssh://host/path or host:path syntax".format(url))

    from datalad import cfg
    identity_file = cfg.get("datalad.ssh.identityfile")
    return sshri, identity_file
def __init__(self, path=None, puke_if_exists=True):
    if not path:
        path = tempfile.mktemp(**get_tempfile_kwargs(
            {'dir': dl_cfg.get("datalad.tests.temp.dir")},
            prefix='testrepo'))
        # to be removed upon teardown
        _TEMP_PATHS_GENERATED.append(path)
    if puke_if_exists and exists(path):
        raise RuntimeError("Directory %s for test repo already exists" % path)
    # swallow logs so we don't print all those about crippled FS etc
    with swallow_logs():
        self.repo = self.REPO_CLASS(path)
        # For additional testing of our datalad remote to not interfere
        # and manage to handle all http urls and requests:
        if self.REPO_CLASS is AnnexRepo and \
                os.environ.get('DATALAD_TESTS_DATALADREMOTE'):
            init_datalad_remote(self.repo, 'datalad', autoenable=True)
    self._created = False
def test_keeptemp_via_env_variable():
    if dl_cfg.get('datalad.tests.temp.keep'):  # pragma: no cover
        pytest.skip("We have env variable set to preserve tempfiles")

    files = []

    @with_tempfile()
    def check(f):
        open(f, 'w').write("LOAD")
        files.append(f)

    with patch.dict('os.environ', {}):
        check()

    with patch.dict('os.environ', {'DATALAD_TESTS_TEMP_KEEP': '1'}):
        check()

    eq_(len(files), 2)
    ok_(not exists(files[0]), msg="File %s still exists" % files[0])
    ok_(exists(files[1]), msg="File %s does not exist" % files[1])

    rmtemp(files[-1])
def eval_func(wrapped, instance, args, kwargs):
    # for result filters and pre/post procedures
    # we need to produce a dict with argname/argvalue pairs for all args
    # incl. defaults and args given as positionals
    allkwargs = get_allargs_as_kwargs(wrapped, args, kwargs)
    # determine class, the __call__ method of which we are decorating:
    # Ben: Note, that this is a bit dirty in PY2 and imposes restrictions on
    # when and how to use eval_results as well as on how to name a command's
    # module and class. As of now, we are inline with these requirements as
    # far as I'm aware.
    mod = sys.modules[wrapped.__module__]
    if PY2:
        # we rely on:
        # - decorated function is method of a subclass of Interface
        # - the name of the class matches the last part of the module's name
        #   if converted to lower
        # for example:
        # ..../where/ever/mycommand.py:
        # class MyCommand(Interface):
        #     @eval_results
        #     def __call__(..)
        command_class_names = \
            [i for i in mod.__dict__
             if type(mod.__dict__[i]) == type
             and issubclass(mod.__dict__[i], Interface)
             and i.lower().startswith(
                 wrapped.__module__.split('.')[-1].replace(
                     'datalad_', '').replace('_', ''))]
        assert len(command_class_names) == 1, (command_class_names,
                                               mod.__name__)
        command_class_name = command_class_names[0]
    else:
        command_class_name = wrapped.__qualname__.split('.')[-2]
    _func_class = mod.__dict__[command_class_name]
    lgr.debug("Determined class of decorated function: %s", _func_class)

    # retrieve common options from kwargs, and fall back on the command
    # class attributes, or general defaults if needed
    kwargs = kwargs.copy()  # we will pop, which might cause side-effect
    common_params = {
        p_name: kwargs.pop(
            p_name,
            getattr(_func_class, p_name, eval_defaults[p_name]))
        for p_name in eval_params}
    # short cuts and configured setup for common options
    on_failure = common_params['on_failure']
    return_type = common_params['return_type']
    # resolve string labels for transformers too
    result_xfm = common_params['result_xfm']
    if result_xfm in known_result_xfms:
        result_xfm = known_result_xfms[result_xfm]
    result_renderer = common_params['result_renderer']
    # TODO remove this conditional branch entirely, done outside
    if not result_renderer:
        result_renderer = dlcfg.get('datalad.api.result-renderer', None)
    # wrap the filter into a helper to be able to pass additional arguments
    # if the filter supports it, but at the same time keep the required
    # interface as minimal as possible. Also do this here, in order to avoid
    # this test to be performed for each return value
    result_filter = common_params['result_filter']
    _result_filter = result_filter
    if result_filter:
        if isinstance(result_filter, Constraint):
            _result_filter = result_filter.__call__
        if (PY2 and inspect.getargspec(_result_filter).keywords) or \
                (not PY2 and inspect.getfullargspec(_result_filter).varkw):

            def _result_filter(res):
                return result_filter(res, **allkwargs)

    def _get_procedure_specs(param_key=None, cfg_key=None, ds=None):
        spec = common_params.get(param_key, None)
        if spec is not None:
            # this is already a list of lists
            return spec

        from datalad.distribution.dataset import Dataset
        ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None
        spec = (ds.config if ds and ds.is_installed()
                else dlcfg).get(cfg_key, None)
        if spec is None:
            return
        elif not isinstance(spec, tuple):
            spec = [spec]
        return [shlex.split(s) for s in spec]

    # query cfg for defaults
    cmdline_name = cls2cmdlinename(_func_class)
    dataset_arg = allkwargs.get('dataset', None)
    proc_pre = _get_procedure_specs(
        'proc_pre',
        'datalad.{}.proc-pre'.format(cmdline_name),
        ds=dataset_arg)
    proc_post = _get_procedure_specs(
        'proc_post',
        'datalad.{}.proc-post'.format(cmdline_name),
        ds=dataset_arg)

    # this internal helper function actually drives the command
    # generator-style, it may generate an exception if desired,
    # on incomplete results
    def generator_func(*_args, **_kwargs):
        # flag whether to raise an exception
        incomplete_results = []
        # track what actions were performed how many times
        action_summary = {}

        if proc_pre and cmdline_name != 'run-procedure':
            from datalad.interface.run_procedure import RunProcedure
            for procspec in proc_pre:
                lgr.debug('Running configured pre-procedure %s', procspec)
                for r in _process_results(
                        RunProcedure.__call__(
                            procspec,
                            dataset=dataset_arg,
                            return_type='generator'),
                        _func_class, action_summary,
                        on_failure, incomplete_results,
                        result_renderer, result_xfm, result_filter,
                        **_kwargs):
                    yield r

        # if a custom summary is to be provided, collect the results
        # of the command execution
        results = []
        do_custom_result_summary = result_renderer == 'tailored' \
            and hasattr(_func_class, 'custom_result_summary_renderer')

        # process main results
        for r in _process_results(
                wrapped(*_args, **_kwargs),
                _func_class, action_summary,
                on_failure, incomplete_results,
                result_renderer, result_xfm, _result_filter, **_kwargs):
            yield r
            # collect if summary is desired
            if do_custom_result_summary:
                results.append(r)

        if proc_post and cmdline_name != 'run-procedure':
            from datalad.interface.run_procedure import RunProcedure
            for procspec in proc_post:
                lgr.debug('Running configured post-procedure %s', procspec)
                for r in _process_results(
                        RunProcedure.__call__(
                            procspec,
                            dataset=dataset_arg,
                            return_type='generator'),
                        _func_class, action_summary,
                        on_failure, incomplete_results,
                        result_renderer, result_xfm, result_filter,
                        **_kwargs):
                    yield r

        # result summary before a potential exception
        # custom first
        if do_custom_result_summary:
            _func_class.custom_result_summary_renderer(results)
        elif result_renderer == 'default' and action_summary and \
                sum(sum(s.values()) for s in action_summary.values()) > 1:
            # give a summary in default mode, when there was more than one
            # action performed
            ui.message("action summary:\n {}".format(
                '\n '.join('{} ({})'.format(
                    act,
                    ', '.join('{}: {}'.format(status,
                                              action_summary[act][status])
                              for status in sorted(action_summary[act])))
                           for act in sorted(action_summary))))

        if incomplete_results:
            raise IncompleteResultsError(
                failed=incomplete_results,
                msg="Command did not complete successfully")

    if return_type == 'generator':
        # hand over the generator
        return generator_func(*args, **kwargs)
    else:
        @wrapt.decorator
        def return_func(wrapped_, instance_, args_, kwargs_):
            results = wrapped_(*args_, **kwargs_)
            if inspect.isgenerator(results):
                # unwind generator if there is one, this actually runs
                # any processing
                results = list(results)
            # render summaries
            if not result_xfm and result_renderer == 'tailored':
                # cannot render transformed results
                if hasattr(_func_class, 'custom_result_summary_renderer'):
                    _func_class.custom_result_summary_renderer(results)
            if return_type == 'item-or-list' and \
                    len(results) < 2:
                return results[0] if results else None
            else:
                return results

        return return_func(generator_func)(*args, **kwargs)
def _get_config(self, var, default=None):
    from datalad import cfg
    return cfg.get(self.name.lower() + '.log.' + var, default)
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None):
    """Make a direct query of a dataset to extract its metadata.

    Parameters
    ----------
    ds : Dataset
    types : list
    """
    errored = False
    dsmeta = dict()
    contentmeta = {}

    if global_meta is not None and content_meta is not None and \
            not global_meta and not content_meta:
        # both are false and not just none
        return dsmeta, contentmeta, errored

    context = {
        '@vocab': 'http://docs.datalad.org/schema_v{}.json'.format(
            vocabulary_version)}

    fullpathlist = paths
    if paths and isinstance(ds.repo, AnnexRepo):
        # Ugly? Jep: #2055
        content_info = zip(paths, ds.repo.file_has_content(paths),
                           ds.repo.is_under_annex(paths))
        paths = [p for p, c, a in content_info if not a or c]
        nocontent = len(fullpathlist) - len(paths)
        if nocontent:
            # TODO better fail, or support incremental and label this file
            # as not present
            lgr.warn(
                '{} files have no content present, '
                'some extractors will not operate on {}'.format(
                    nocontent,
                    'them' if nocontent > 10
                    else [p for p, c, a in content_info if not c and a])
            )

    # pull out potential metadata field blacklist config settings
    blacklist = [re.compile(bl) for bl in assure_list(ds.config.obtain(
        'datalad.metadata.aggregate-ignore-fields', default=[]))]
    # enforce size limits
    max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize')
    # keep local, who knows what some extractors might pull in
    from pkg_resources import iter_entry_points  # delayed heavy import
    extractors = {ep.name: ep
                  for ep in iter_entry_points('datalad.metadata.extractors')}

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s', ds,
        total=len(types),
        label='Metadata extraction',
        unit=' extractors',
    )
    for mtype in types:
        mtype_key = mtype
        log_progress(
            lgr.info,
            'metadataextractors',
            'Engage %s metadata extractor', mtype_key,
            update=1,
            increment=True)
        if mtype_key not in extractors:
            # we said that we want to fail, rather than just moan about
            # less metadata
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
            raise ValueError(
                'Enabled metadata extractor %s is not available in this '
                'installation',
                mtype_key)
        try:
            extractor_cls = extractors[mtype_key].load()
            extractor = extractor_cls(
                ds,
                paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist)
        except Exception as e:
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s', mtype_key, ds,
            )
            raise ValueError(
                "Failed to load metadata extractor for '%s', "
                "broken dataset configuration (%s)?: %s",
                mtype, ds, exc_str(e))
            continue
        try:
            dsmeta_t, contentmeta_t = extractor.get_metadata(
                dataset=global_meta if global_meta is not None
                else ds.config.obtain(
                    'datalad.metadata.aggregate-dataset-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()),
                content=content_meta if content_meta is not None
                else ds.config.obtain(
                    'datalad.metadata.aggregate-content-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()))
        except Exception as e:
            lgr.error('Failed to get dataset metadata ({}): {}'.format(
                mtype, exc_str(e)))
            if cfg.get('datalad.runtime.raiseonerror'):
                log_progress(
                    lgr.error,
                    'metadataextractors',
                    'Failed %s metadata extraction from %s', mtype_key, ds,
                )
                raise
            errored = True
            # if we don't get global metadata we do not want content metadata
            continue

        if dsmeta_t:
            if _ok_metadata(dsmeta_t, mtype, ds, None):
                dsmeta_t = _filter_metadata_fields(
                    dsmeta_t,
                    maxsize=max_fieldsize,
                    blacklist=blacklist)
                dsmeta[mtype_key] = dsmeta_t
            else:
                errored = True

        unique_cm = {}
        extractor_unique_exclude = getattr(extractor_cls, "_unique_exclude",
                                           set())
        # TODO: ATM neuroimaging extractors all provide their own internal
        # log_progress but if they are all generators, we could provide
        # generic handling of the progress here.  Note also that the log
        # message actually seems to be ignored and not used, only the
        # label ;-)
        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Metadata extraction per location for %s', mtype,
        #     # contentmeta_t is a generator... so no count is known
        #     # total=len(contentmeta_t or []),
        #     label='Metadata extraction per location',
        #     unit=' locations',
        # )
        for loc, meta in contentmeta_t or {}:
            lgr.log(5, "Analyzing metadata for %s", loc)
            # log_progress(
            #     lgr.debug,
            #     'metadataextractors_loc',
            #     'ignoredatm',
            #     label=loc,
            #     update=1,
            #     increment=True)
            if not _ok_metadata(meta, mtype, ds, loc):
                errored = True
                # log_progress(
                #     lgr.debug,
                #     'metadataextractors_loc',
                #     'ignoredatm',
                #     label='Failed for %s' % loc,
                # )
                continue
            # we also want to store info that there was no metadata (e.g. to
            # get a list of files that have no metadata)
            # if there is an issue that an extractor needlessly produces
            # empty records, the extractor should be fixed and not a general
            # switch. For example the datalad_core issues empty records to
            # document the presence of a file
            #elif not meta:
            #    continue

            # apply filters
            meta = _filter_metadata_fields(
                meta,
                maxsize=max_fieldsize,
                blacklist=blacklist)

            if not meta:
                continue

            # assign
            # only ask each metadata extractor once, hence no conflict
            # possible
            loc_dict = contentmeta.get(loc, {})
            loc_dict[mtype_key] = meta
            contentmeta[loc] = loc_dict

            if ds.config.obtain(
                    'datalad.metadata.generate-unique-{}'.format(
                        mtype_key.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()):
                # go through content metadata and inject report of unique
                # keys and values into `dsmeta`
                for k, v in iteritems(meta):
                    if k in dsmeta.get(mtype_key, {}):
                        # if the dataset already has a dedicated idea
                        # about a key, we skip it from the unique list
                        # the point of the list is to make missing info about
                        # content known in the dataset, not to blindly
                        # duplicate metadata. Example: list of samples data
                        # were recorded from. If the dataset has such under
                        # a 'sample' key, we should prefer that, over an
                        # aggregated list of a hopefully-kinda-ok structure
                        continue
                    elif k in extractor_unique_exclude:
                        # the extractor thinks this key is worthless for the
                        # purpose of discovering whole datasets
                        # we keep the key (so we know that some file is
                        # providing this key), but ignore any value it
                        # came with
                        unique_cm[k] = None
                        continue
                    vset = unique_cm.get(k, set())
                    vset.add(_val2hashable(v))
                    unique_cm[k] = vset

        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Finished metadata extraction across locations for %s', mtype)

        if unique_cm:
            # per source storage here too
            ucp = dsmeta.get('datalad_unique_content_properties', {})

            # important: we want to have a stable order regarding
            # the unique values (a list). we cannot guarantee the
            # same order of discovery, hence even when not using a
            # set above we would still need sorting. the challenge
            # is that any value can be an arbitrarily complex nested
            # beast
            # we also want to have each unique value set always come
            # in a top-level list, so we know if some unique value
            # was a list, as opposed to a list of unique values

            def _ensure_serializable(val):
                if isinstance(val, ReadOnlyDict):
                    return {k: _ensure_serializable(v)
                            for k, v in iteritems(val)}
                if isinstance(val, (tuple, list)):
                    return [_ensure_serializable(v) for v in val]
                else:
                    return val

            ucp[mtype_key] = {
                k: [_ensure_serializable(i)
                    for i in sorted(v, key=_unique_value_key)]
                if v is not None else None
                for k, v in iteritems(unique_cm)
                # v == None (disable unique, but there was a value at some
                # point) otherwise we only want actual values, and also no
                # single-item-lists of a non-value
                # those contribute no information, but bloat the operation
                # (inflated number of keys, inflated storage, inflated
                # search index, ...)
                if v is None or (v and not v == {''})}
            dsmeta['datalad_unique_content_properties'] = ucp

    log_progress(
        lgr.info,
        'metadataextractors',
        'Finished metadata extraction from %s', ds,
    )

    # always identify the effective vocabulary - JSON-LD style
    if context:
        dsmeta['@context'] = context

    return dsmeta, contentmeta, errored
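# Hedged sketch of the old-style extractor contract assumed by
# _get_metadata() above (illustrative only, not a shipped extractor):
# get_metadata() returns a (dataset_metadata, content_metadata) pair, where
# content_metadata is an iterable of (location, dict) items -- exactly what
# the `for loc, meta in contentmeta_t or {}` loop consumes.
class ToyOldExtractor(object):
    # extractors without this flag set get the full path list, content
    # present or not (see the NEEDS_CONTENT dispatch above)
    NEEDS_CONTENT = False

    def __init__(self, ds, paths=None):
        self.ds = ds
        self.paths = paths or []

    def get_metadata(self, dataset=True, content=True):
        dsmeta = {'name': 'toy dataset'} if dataset else {}
        cnmeta = [(p, {'kind': 'file'}) for p in self.paths] if content \
            else []
        return dsmeta, cnmeta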
def newfunc(*args, **kwargs):
    from datalad import cfg
    test_ssh = cfg.get("datalad.tests.ssh", '')
    if not test_ssh or test_ssh in ('0', 'false', 'no'):
        raise SkipTest("Run this test by setting DATALAD_TESTS_SSH")
    return func(*args, **kwargs)
import logging

from unittest.mock import patch

from datalad import cfg
from datalad.support.annexrepo import AnnexRepo
from datalad.utils import swallow_logs
from datalad.distribution.dataset import Dataset

from .utils import with_tempfile
from .utils import skip_if_no_network
from .utils import with_testrepos
from .utils import on_windows
from .utils import SkipTest

if on_windows:
    raise SkipTest("Can't test direct mode switch, "
                   "if direct mode is forced by OS anyway.")

repo_version = cfg.get("datalad.repo.version", None)
if repo_version and int(repo_version) >= 6:
    raise SkipTest("Can't test direct mode switch, "
                   "if repository version 6 or later is enforced.")


@with_tempfile
@with_tempfile
@with_tempfile
@with_tempfile
def test_direct_cfg(path1, path2, path3, path4):
    with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
        # create annex repo in direct mode:
        with swallow_logs(new_level=logging.DEBUG) as cml:
            ar = AnnexRepo(path1, create=True)
            cml.assert_logged("Switching to direct mode",