def test_assert_cwd_unchanged_not_masking_exceptions():
    # Test that we are not masking out other "more important" exceptions
    orig_cwd = os.getcwd()

    @assert_cwd_unchanged
    def do_chdir_value_error():
        os.chdir(os.pardir)
        raise ValueError("error exception")

    with swallow_logs(new_level=logging.WARN) as cml:
        with assert_raises(ValueError) as cm:
            do_chdir_value_error()

        # retrospect exception
        eq_(orig_cwd, os.getcwd(),
            "assert_cwd_unchanged didn't return us back to %s" % orig_cwd)
        assert_in("Mitigating and changing back", cml.out)

    # and again but allowing to chdir
    @assert_cwd_unchanged(ok_to_chdir=True)
    def do_chdir_value_error():
        os.chdir(os.pardir)
        raise ValueError("error exception")

    with swallow_logs(new_level=logging.WARN) as cml:
        assert_raises(ValueError, do_chdir_value_error)
        eq_(orig_cwd, os.getcwd(),
            "assert_cwd_unchanged didn't return us back to %s" % orig_cwd)
        assert_not_in("Mitigating and changing back", cml.out)

def test_gitrepo_call_git_methods(path=None):
    gr = GitRepo(path).init()
    gr.call_git(['add', "foo", "bar"])
    gr.call_git(['commit', '-m', "foobar"])

    gr.call_git(["mv"], files=["foo", "foo.txt"])
    ok_((gr.pathobj / 'foo.txt').exists())

    for expect_fail, check in [(False, assert_in),
                               (True, assert_not_in)]:
        with swallow_logs(new_level=logging.DEBUG) as cml:
            with assert_raises(CommandError):
                gr.call_git(["mv"], files=["notthere", "dest"],
                            expect_fail=expect_fail)
            check("fatal: bad source", cml.out)

    eq_(list(gr.call_git_items_(["ls-files"], read_only=True)),
        ["bar", "foo.txt"])
    eq_(list(gr.call_git_items_(["ls-files", "-z"], sep="\0", read_only=True)),
        # Note: The custom separator leaves a trailing empty item, and since
        # this is an arbitrary command with unknown output it isn't safe to
        # trim it.
        ["bar", "foo.txt", ""])

    with assert_raises(AssertionError):
        gr.call_git_oneline(["ls-files"], read_only=True)
    eq_(gr.call_git_oneline(["ls-files"], files=["bar"], read_only=True),
        "bar")

    ok_(gr.call_git_success(["rev-parse", "HEAD^{commit}"], read_only=True))
    with swallow_logs(new_level=logging.DEBUG) as cml:
        assert_false(gr.call_git_success(["rev-parse", "HEAD^{blob}"],
                                         read_only=True))
        assert_not_in("expected blob type", cml.out)

def test_windows_incompatible_names(path=None):
    ds = Dataset(path).create()
    create_tree(
        path,
        {
            'imgood': 'Look what a nice name I have',
            'illegal:character.txt': 'strange choice of name',
            'spaceending ': 'who does these things?',
            'lookmumadot.': 'why would you do this?',
            'COM1.txt': 'I am a serial port',
            'dirs with spaces': {
                'seriously?': 'you are stupid',
                'why somuch?wrongstuff.': "I gave up",
            },
        })
    ds.repo.config.set('datalad.save.windows-compat-warning', 'error')
    ds.save('.datalad/config')
    res = ds.save(on_failure='ignore')
    # check that none of the 6 problematic files was saved, but the good one was
    assert_result_count(res, 6, status='impossible', action='save')
    assert_result_count(res, 1, status='ok', action='save')

    # check that the warning is emitted
    ds.repo.config.set('datalad.save.windows-compat-warning', 'warning')
    ds.save('.datalad/config')
    with swallow_logs(new_level=logging.WARN) as cml:
        ds.save()
        cml.assert_logged(
            "Some elements of your dataset are not compatible with Windows "
            "systems. Disable this check by changing "
            "datalad.save.windows-compat-warning or consider renaming the "
            "following elements:")
        assert_in("Elements using a reserved filename:", cml.out)
        assert_in("Elements with illegal characters:", cml.out)
        assert_in("Elements ending with a dot:", cml.out)
        assert_in("Elements ending with a space:", cml.out)

    # check that a setting of 'none' really does nothing
    ds.repo.config.set('datalad.save.windows-compat-warning', 'none')
    ds.save('.datalad/config')
    create_tree(
        path,
        {
            'more illegal:characters?.py':
                'My arch nemesis uses Windows and I will destroy them! '
                'Muahahaha'
        })
    with swallow_logs(new_level=logging.WARN) as cml:
        res = ds.save()
        # we shouldn't see warnings
        assert_not_in(
            "Some elements of your dataset are not compatible with Windows "
            "systems. Disable this check by changing "
            "datalad.save.windows-compat-warning or consider renaming the "
            "following elements:", cml.out)
        # make sure the file is saved successfully
        assert_result_count(res, 1, status='ok', action='save')

def test_addurls_dry_run(path=None):
    ds = Dataset(path).create(force=True)

    json_file = "links.json"
    with open(op.join(ds.path, json_file), "w") as jfh:
        json.dump([{"url": "URL/a.dat", "name": "a", "subdir": "foo"},
                   {"url": "URL/b.dat", "name": "b", "subdir": "bar"},
                   {"url": "URL/c.dat", "name": "c", "subdir": "foo"}],
                  jfh)

    ds.save(message="setup")

    with swallow_logs(new_level=logging.INFO) as cml:
        ds.addurls(json_file,
                   "{url}",
                   "{subdir}//{_url_filename_root}",
                   dry_run=True,
                   result_renderer='disabled')

        for dir_ in ["foo", "bar"]:
            assert_in("Would create a subdataset at {}".format(dir_),
                      cml.out)
        assert_in(
            "Would download URL/a.dat to {}".format(
                os.path.join(path, "foo", "BASE")),
            cml.out)
        assert_in("Metadata: {}".format([u"name=a", u"subdir=foo"]),
                  cml.out)

def test_url_base():
    # Basic checks
    assert_raises(ValueError, URL, "http://example.com", hostname='example.com')
    url = URL("http://example.com")
    eq_(url.hostname, 'example.com')
    eq_(url.scheme, 'http')
    eq_(url.port, '')  # not specified -- empty strings
    eq_(url.username, '')  # not specified -- empty strings
    eq_(repr(url), "URL(hostname='example.com', scheme='http')")
    eq_(url, "http://example.com")  # automagic coercion in __eq__
    neq_(URL(), URL(hostname='x'))

    smth = URL('smth')
    eq_(smth.hostname, '')
    ok_(bool(smth))
    nok_(bool(URL()))

    assert_raises(ValueError, url._set_from_fields, unknown='1')

    with swallow_logs(new_level=logging.WARNING) as cml:
        # we don't "care" about params ATM so there is a warning if there are any
        purl = URL("http://example.com/;param")
        eq_(str(purl), 'http://example.com/;param')  # but we do maintain original string
        assert_in('ParseResults contains params', cml.out)
        eq_(purl.as_str(), 'http://example.com/')

def test_addurls_version(self=None, path=None):
    ds = Dataset(path).create(force=True)

    def version_fn(url):
        if url.endswith("b.dat"):
            raise ValueError("Scheme error")
        return url + ".v1"

    with patch("datalad.local.addurls.get_versioned_url", version_fn):
        with swallow_logs(new_level=logging.WARNING) as cml:
            ds.addurls(self.json_file, "{url}", "{name}",
                       version_urls=True, result_renderer='disabled')
            assert_in("b.dat", str(cml.out))

    names = ["a", "c"]
    for fname in names:
        ok_exists(os.path.join(path, fname))

    whereis = ds.repo.whereis(names, output="full")
    for fname, info in whereis.items():
        eq_(info[WEB_SPECIAL_REMOTE_UUID]['urls'],
            ["{}udir/{}.dat.v1".format(self.url, fname)])

def test_install_skip_failed_recursive(src=None, path=None):
    _mk_submodule_annex(src, fname="test-annex.dat", fcontent="whatever")

    # install top level:
    ds = install(path, source=src)
    sub1 = Dataset(opj(path, 'subm 1'))
    sub2 = Dataset(opj(path, '2'))
    # sabotage recursive installation of 'subm 1' by polluting the target:
    with open(opj(path, 'subm 1', 'blocking.txt'), "w") as f:
        f.write("sdfdsf")

    with swallow_logs(new_level=logging.WARNING) as cml:
        result = ds.get(
            os.curdir, recursive=True,
            on_failure='ignore', result_xfm=None)
        # toplevel dataset was in the house already
        assert_result_count(result, 0, path=ds.path, type='dataset')
        # subm 1 should fail to install. [1] since comes after '2' submodule
        assert_in_results(
            result, status='error', path=sub1.path, type='dataset',
            message='target path already exists and not empty, refuse to '
                    'clone into target path')
        assert_in_results(result, status='ok', path=sub2.path)

def test_too_long():
    with swallow_logs(new_level=logging.ERROR) as cml:
        # we still raise an exception if we exceed too much
        with assert_raises(OSError):
            Runner().run(
                [sys.executable, '-c', 'import sys; print(len(sys.argv))']
                + [str(i) for i in range(CMD_MAX_ARG)],
                protocol=StdOutCapture
            )
        cml.assert_logged('.*use.*ulimit.*')

def test_addurls_dropped_urls(self=None, path=None):
    ds = Dataset(path).create(force=True)
    with swallow_logs(new_level=logging.WARNING) as cml:
        ds.addurls(self.json_file, "", "{subdir}//{name}",
                   result_renderer='disabled')
        assert_re_in(r".*Dropped [0-9]+ row\(s\) that had an empty URL",
                     str(cml.out))

def test_guess_dot_git(path=None, url=None, tdir=None, *, annex):
    repo = (AnnexRepo if annex else GitRepo)(path, create=True)
    repo.add('file.txt', git=not annex)
    repo.commit()

    # we need to prepare to be served via http, otherwise it must fail
    with swallow_logs() as cml:
        assert_raises(IncompleteResultsError, install, path=tdir, source=url)
    ok_(not exists(tdir))

    Runner(cwd=path).run(['git', 'update-server-info'])

    with swallow_logs() as cml:
        installed = install(tdir, source=url)
        assert_not_in("Failed to get annex.uuid", cml.out)
    eq_(installed.pathobj.resolve(), Path(tdir).resolve())
    ok_(exists(tdir))
    assert_repo_status(tdir, annex=annex)

def test_external_versions_rogue_module(topd=None):
    ev = ExternalVersions()

    # if module throws some other non-ImportError exception upon import
    # we must not crash, but issue a warning
    modname = 'verycustomrogue__'
    create_tree(topd, {modname + '.py': 'raise Exception("pickaboo")'})
    with patch('sys.path', [topd]), \
            swallow_logs(new_level=logging.WARNING) as cml:
        assert ev[modname] is None
        assert_true(ev.dumps(indent=True).endswith(linesep))
        assert_in('pickaboo', cml.out)

def test_addurls_subdataset(self=None, path=None):
    ds = Dataset(path).create(force=True)

    for save in True, False:
        label = "save" if save else "nosave"
        with swallow_outputs() as cmo:
            ds.addurls(self.json_file, "{url}",
                       "{subdir}-" + label + "//{name}",
                       save=save, cfg_proc=["yoda"])
            # The custom result renderer transforms the subdataset
            # action=create results into something more informative than
            # "create(ok): . (dataset)"...
            assert_in("create(ok): foo-{} (dataset)".format(label), cmo.out)
            # ... and that doesn't lose the standard summary.
            assert_in("create (ok: 2)", cmo.out)

        subdirs = [op.join(ds.path, "{}-{}".format(d, label))
                   for d in ["foo", "bar"]]
        subdir_files = dict(zip(subdirs, [["a", "c"], ["b"]]))

        for subds, fnames in subdir_files.items():
            for fname in fnames:
                ok_exists(op.join(subds, fname))
            # cfg_proc was applied to the generated subdatasets.
            ok_exists(op.join(subds, "code"))
        if save:
            assert_repo_status(path)
        else:
            # The datasets are created but not saved (since asked not to),
            assert_repo_status(path, untracked=subdirs)
            # but the downloaded files aren't.
            for subds, fnames in subdir_files.items():
                assert_repo_status(subds, added=fnames)

    # Now save the "--nosave" changes and check that we have
    # all the subdatasets.
    ds.save()
    eq_(set(subdatasets(dataset=ds, recursive=True,
                        result_xfm="relpaths")),
        {"foo-save", "bar-save", "foo-nosave", "bar-nosave"})

    # We don't try to recreate existing subdatasets.
    with swallow_logs(new_level=logging.DEBUG) as cml:
        ds.addurls(self.json_file, "{url}", "{subdir}-nosave//{name}",
                   result_renderer='disabled')
        assert_in("Not creating subdataset at existing path", cml.out)

def test_addurls_no_rows(self=None, path=None):
    ds = Dataset(path).create(force=True)
    for fname in ["in.csv", "in.tsv", "in.json"]:
        with swallow_logs(new_level=logging.WARNING) as cml:
            assert_in_results(
                ds.addurls(fname, "{url}", "{name}",
                           result_renderer='disabled'),
                action="addurls",
                status="notneeded")
            cml.assert_logged("No rows", regex=False)

def check_filters(name):
    with swallow_logs(new_level=logging.DEBUG, name=name) as cml:
        lgr1 = logging.getLogger(name + '.goodone')
        lgr2 = logging.getLogger(name + '.anotherone')
        lgr3 = logging.getLogger(name + '.bad')
        lgr1.debug('log1')
        lgr2.info('log2')
        lgr3.info('log3')
        assert_in('log1', cml.out)
        assert_in('log2', cml.out)
        assert_not_in('log3', cml.out)

def test_install_dataladri(src=None, topurl=None, path=None):
    # make plain git repo
    ds_path = opj(src, 'ds')
    gr = GitRepo(ds_path, create=True)
    gr.add('test.txt')
    gr.commit('demo')
    Runner(cwd=gr.path).run(['git', 'update-server-info'])
    # now install it somewhere else
    with patch('datalad.consts.DATASETS_TOPURL', topurl), \
            swallow_logs():
        ds = install(path, source='///ds')
    eq_(ds.path, path)
    assert_repo_status(path, annex=False)
    ok_file_has_content(opj(path, 'test.txt'), 'some')

def test_log_progress_noninteractive_filter():
    name = "dl-test"
    lgr = LoggerHelper(name).get_initialized_logger()
    pbar_id = "lp_test"
    with swallow_logs(new_level=logging.INFO, name=name) as cml:
        log_progress(lgr.info, pbar_id, "Start", label="testing", total=3)
        log_progress(lgr.info, pbar_id, "THERE0", update=1)
        log_progress(lgr.info, pbar_id, "NOT", update=1,
                     noninteractive_level=logging.DEBUG)
        log_progress(lgr.info, pbar_id, "THERE1", update=1,
                     noninteractive_level=logging.INFO)
        log_progress(lgr.info, pbar_id, "Done")
    for present in ["Start", "THERE0", "THERE1", "Done"]:
        assert_in(present, cml.out)
    assert_not_in("NOT", cml.out)

def test_push_git_annex_branch_many_paths_same_data(path=None):
    path = Path(path)
    ds = Dataset(path / "ds").create(force=True)
    ds.save()
    mk_push_target(ds, "target", str(path / "target"),
                   annex=True, bare=False)

    nbytes = sum(ds.repo.get_content_annexinfo(paths=[f])[f]["bytesize"]
                 for f in [ds.repo.pathobj / "f0",
                           ds.repo.pathobj / "f3",
                           ds.repo.pathobj / "f4"])
    with swallow_logs(new_level=logging.DEBUG) as cml:
        res = ds.push(to="target")
    assert_in("{} bytes of annex data".format(nbytes), cml.out)
    # 3 files point to content already covered by another file.
    assert_result_count(res, 3, action="copy", type="file",
                        status="notneeded")

def check_basic_scenario(url, d=None):
    ds = Dataset(d).create()
    annex = ds.repo

    # TODO skip if no boto or no credentials
    get_test_providers(url)  # so to skip if unknown creds

    # Let's try to add some file which we should have access to
    ds.download_url(url)
    ds.save()

    # git-annex got a fix where it stopped replacing - in the middle of the filename.
    # Let's cater to the developers who might have some intermediate version that is
    # not easy to compare -- we will just check that only one file is there and that
    # it matches what we expect when outside of the development versions range:
    filenames = glob.glob(op.join(d, '3versions[-_]allversioned.txt'))
    eq_(len(filenames), 1)
    filename = op.basename(filenames[0])
    if external_versions['cmd:annex'] < '8.20200501':
        assert_in('_', filename)
    # Date after the fix in 8.20200501-53-gcabbc91b1
    elif external_versions['cmd:annex'] >= '8.20200512':
        assert_in('-', filename)
    else:
        pass  # either of those is ok

    whereis1 = annex.whereis(filename, output='full')
    eq_(len(whereis1), 2)  # here and datalad
    annex.drop(filename)

    whereis2 = annex.whereis(filename, output='full')
    eq_(len(whereis2), 1)  # datalad

    # make sure that there are no "hidden" error messages, despite the
    # whereis command succeeding
    # https://github.com/datalad/datalad/issues/6453#issuecomment-1047533276
    from datalad.runner import StdOutErrCapture
    # we need to swallow logs since if DATALAD_LOG_LEVEL is set low, we
    # would get all the git-annex debug output in stderr
    with swallow_logs(new_level=logging.INFO) as cml:
        out = annex._call_annex(['whereis'], protocol=StdOutErrCapture)
    eq_(out['stderr'].strip(), '')

    # if we provide some bogus address which we can't access, we shouldn't pollute output
    with assert_raises(CommandError) as cme:
        annex.add_url_to_file('bogus', url + '_bogus')
    assert_in('addurl: 1 failed', cme.value.stderr)

def test_assert_Xwd_unchanged_ok_chdir(func):
    # Test that we are not masking out other "more important" exceptions
    orig_cwd = os.getcwd()
    orig_pwd = getpwd()

    @assert_cwd_unchanged(ok_to_chdir=True)
    def do_chdir_value_error():
        func(os.pardir)
        return "a value"

    with swallow_logs() as cml:
        eq_(do_chdir_value_error(), "a value")
        eq_(orig_cwd, os.getcwd(),
            "assert_cwd_unchanged didn't return us back to cwd %s" % orig_cwd)
        eq_(orig_pwd, getpwd(),
            "assert_cwd_unchanged didn't return us back to cwd %s" % orig_pwd)
        assert_not_in("Mitigating and changing back", cml.out)

def test_is_url():
    ok_(is_url('file://localhost/some'))
    ok_(is_url('http://localhost'))
    ok_(is_url('ssh://me@localhost'))
    # in current understanding it is indeed a url but an 'ssh', implicit=True, not just
    # a useless scheme=weird with a hope to point to a netloc
    with swallow_logs():
        ok_(is_url('weird://'))
    nok_(is_url('relative'))
    nok_(is_url('/absolute'))
    ok_(is_url('like@sshlogin'))  # actually we do allow ssh:implicit urls ATM
    nok_(is_url(''))
    nok_(is_url(' '))
    nok_(is_url(123))  # stuff of other types wouldn't be considered a URL
    # we can pass RI instance directly
    ok_(is_url(RI('file://localhost/some')))
    nok_(is_url(RI('relative')))

def test_ssh_custom_identity_file():
    ifile = "/tmp/dl-test-ssh-id"  # Travis
    if not op.exists(ifile):
        raise SkipTest(
            "Travis-specific '{}' identity file does not exist".format(ifile))

    with patch_config({"datalad.ssh.identityfile": ifile}):
        with swallow_logs(new_level=logging.DEBUG) as cml:
            manager = SSHManager()
            ssh = manager.get_connection('ssh://datalad-test')
            cmd_out, _ = ssh("echo blah")
            if _ssh_manager_is_multiplex:
                expected_socket = op.join(
                    str(manager.socket_dir),
                    get_connection_hash("datalad-test", identity_file=ifile))
                ok_(exists(expected_socket))
            manager.close()
            assert_in("-i", cml.out)
            assert_in(ifile, cml.out)

def test_create_alias(ds_path=None, ria_path=None, clone_path=None):
    ds_path = Path(ds_path)
    clone_path = Path(clone_path)

    ds_path.mkdir()
    dsa = Dataset(ds_path / "a").create()

    res = dsa.create_sibling_ria(url="ria+file://{}".format(ria_path),
                                 name="origin",
                                 alias="ds-a",
                                 new_store_ok=True)
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')

    ds_clone = clone(source="ria+file://{}#~ds-a".format(ria_path),
                     path=clone_path / "a")
    assert_repo_status(ds_clone.path)

    # multiple datasets in a RIA store with different aliases work
    dsb = Dataset(ds_path / "b").create()

    res = dsb.create_sibling_ria(url="ria+file://{}".format(ria_path),
                                 name="origin",
                                 alias="ds-b",
                                 new_store_ok=True)
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')

    ds_clone = clone(source="ria+file://{}#~ds-b".format(ria_path),
                     path=clone_path / "b")
    assert_repo_status(ds_clone.path)

    # second dataset in a RIA store with the same alias emits a warning
    dsc = Dataset(ds_path / "c").create()
    with swallow_logs(logging.WARNING) as cml:
        res = dsc.create_sibling_ria(url="ria+file://{}".format(ria_path),
                                     name="origin",
                                     alias="ds-a",
                                     new_store_ok=True)
        assert_in("Alias 'ds-a' already exists in the RIA store, "
                  "not adding an alias",
                  cml.out)
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')

def _check_ri(ri, cls, exact_str=True, localpath=None, **fields):
    """just a helper to carry out few checks on urls"""
    with swallow_logs(new_level=logging.DEBUG) as cml:
        ri_ = cls(**fields)
        murl = RI(ri)
        eq_(murl.__class__, cls)  # not just a subclass
        eq_(murl, ri_)
        if isinstance(ri, str):
            eq_(str(RI(ri)), ri)
        eq_(eval(repr(ri_)), ri)  # repr leads back to identical ri_
        eq_(ri, ri_)  # just in case ;)  above should fail first if smth is wrong
        if not exact_str:
            assert_in('Parsed version of', cml.out)
    # that we can reconstruct it EXACTLY on our examples
    (eq_ if exact_str else neq_)(str(ri), str(ri_))

    # and that we have access to all those fields
    nok_(set(fields).difference(set(cls._FIELDS)))
    for f, v in fields.items():
        eq_(getattr(ri_, f), v)

    if localpath:
        eq_(ri_.localpath, localpath)
        old_localpath = ri_.localpath  # for a test below
    else:
        # if not given -- must be a remote url, should raise exception
        with assert_raises(ValueError):
            ri_.localpath

    # This one does not have a path. TODO: either proxy path from its .RI or adjust
    # hierarchy of classes to make it more explicit
    if cls == GitTransportRI:
        return

    # do changes in the path persist?
    old_str = str(ri_)
    ri_.path = newpath = opj(ri_.path, 'sub')
    eq_(ri_.path, newpath)
    neq_(str(ri_), old_str)
    if localpath:
        eq_(ri_.localpath, opj(old_localpath, 'sub'))

def test_rerun_invalid_merge_run_commit(path=None):
    ds = Dataset(path).create()
    ds.run("echo foo >>foo")
    ds.run("echo invalid >>invalid")
    run_msg = last_commit_msg(ds.repo)
    run_hexsha = ds.repo.get_hexsha()
    ds.repo.call_git(["reset", "--hard", DEFAULT_BRANCH + "~"])
    with open(op.join(ds.path, "non-run"), "w") as nrfh:
        nrfh.write("non-run")
    ds.save()
    # Assign two parents to the invalid run commit.
    commit = ds.repo.call_git_oneline(
        ["commit-tree", run_hexsha + "^{tree}", "-m", run_msg,
         "-p", run_hexsha + "^",
         "-p", ds.repo.get_hexsha()])

    ds.repo.call_git(["reset", "--hard", commit])
    hexsha_orig = ds.repo.get_hexsha()
    with swallow_logs(new_level=logging.WARN) as cml:
        ds.rerun(since="")
        assert_in("has run information but is a merge commit", cml.out)
    eq_(len(ds.repo.get_revisions(hexsha_orig + ".." + DEFAULT_BRANCH)), 1)

def test_ssh_manager_close_no_throw(bogus_socket=None):
    manager = MultiplexSSHManager()

    class bogus:
        def close(self):
            raise Exception("oh I am so bad")

        @property
        def ctrl_path(self):
            with open(bogus_socket, "w") as f:
                f.write("whatever")
            return Path(bogus_socket)

    # since we are digging into protected area - should also set _prev_connections
    manager._prev_connections = {}
    manager._connections['bogus'] = bogus()
    assert_raises(Exception, manager.close)
    assert_raises(Exception, manager.close)

    # but should proceed just fine if allow_fail=False
    with swallow_logs(new_level=logging.DEBUG) as cml:
        manager.close(allow_fail=False)
        assert_in('Failed to close a connection: oh I am so bad', cml.out)

def test_git_config_warning(path=None):
    if 'GIT_AUTHOR_NAME' in os.environ:
        raise SkipTest("Found existing explicit identity config")

    # Note: An easier way to test this would be to just set GIT_CONFIG_GLOBAL
    # to point somewhere else. However, this is not supported by git before
    # 2.32. Hence, stick with changed HOME in this test, but be sure to unset a
    # possible GIT_CONFIG_GLOBAL in addition.

    patched_env = os.environ.copy()
    patched_env.pop('GIT_CONFIG_GLOBAL', None)
    patched_env.update(get_home_envvars(path))
    with chpwd(path), \
            patch.dict('os.environ', patched_env, clear=True), \
            swallow_logs(new_level=30) as cml:
        # no configs in that empty HOME
        from datalad.api import Dataset
        from datalad.config import ConfigManager

        # reach into the class and disable the "checked" flag that
        # has already been tripped before we get here
        ConfigManager._checked_git_identity = False
        Dataset(path).config.reload()
        assert_in("configure Git before", cml.out)

def test_globbedpaths(path=None):
    dotdir = op.curdir + op.sep

    for patterns, expected in [
            (["1.txt", "2.dat"], {"1.txt", "2.dat"}),
            ([dotdir + "1.txt", "2.dat"], {dotdir + "1.txt", "2.dat"}),
            (["*.txt", "*.dat"],
             {"1.txt", "2.dat", bOBSCURE_FILENAME, "3.txt"}),
            ([dotdir + "*.txt", "*.dat"],
             {dotdir + "1.txt", "2.dat", bOBSCURE_FILENAME,
              dotdir + "3.txt"}),
            ([op.join("subdir", "*.txt")],
             {op.join("subdir", "1.txt"), op.join("subdir", "2.txt")}),
            (["subdir" + op.sep], {"subdir" + op.sep}),
            ([dotdir + op.join("subdir", "*.txt")],
             {dotdir + op.join(*ps)
              for ps in [("subdir", "1.txt"), ("subdir", "2.txt")]}),
            (["*.txt"], {"1.txt", "3.txt"}),
            ([op.join("subdir", "**")],
             {op.join(*ps)
              for ps in [("subdir" + op.sep,),
                         ("subdir", "subsub"),
                         ("subdir", "1.txt"),
                         ("subdir", "2.txt"),
                         ("subdir", "subsub", "3.dat")]}),
            ([dotdir + op.join("**", "*.dat")],
             {dotdir + op.join("2.dat"),
              dotdir + bOBSCURE_FILENAME,
              dotdir + op.join("subdir", "subsub", "3.dat")})]:
        gp = GlobbedPaths(patterns, pwd=path)
        eq_(set(gp.expand()), expected)
        eq_(set(gp.expand(full=True)),
            {op.join(path, p) for p in expected})

    pardir = op.pardir + op.sep
    subdir_path = op.join(path, "subdir")
    for patterns, expected in [
            (["*.txt"], {"1.txt", "2.txt"}),
            ([dotdir + "*.txt"], {dotdir + p for p in ["1.txt", "2.txt"]}),
            ([pardir + "*.txt"], {pardir + p for p in ["1.txt", "3.txt"]}),
            ([dotdir + pardir + "*.txt"],
             {dotdir + pardir + p for p in ["1.txt", "3.txt"]}),
            # Patterns that don't match are retained by default.
            (["amiss"], {"amiss"})]:
        gp = GlobbedPaths(patterns, pwd=subdir_path)
        eq_(set(gp.expand()), expected)
        eq_(set(gp.expand(full=True)),
            {op.join(subdir_path, p) for p in expected})

    # Full patterns still get returned as relative to pwd.
    gp = GlobbedPaths([op.join(path, "*.dat")], pwd=path)
    eq_(gp.expand(), ["2.dat", bOBSCURE_FILENAME])

    # "." gets special treatment.
    gp = GlobbedPaths([".", "*.dat"], pwd=path)
    eq_(set(gp.expand()), {"2.dat", bOBSCURE_FILENAME, "."})
    eq_(gp.expand(dot=False), ["2.dat", bOBSCURE_FILENAME])
    gp = GlobbedPaths(["."], pwd=path, expand=False)
    eq_(gp.expand(), ["."])
    eq_(gp.paths, ["."])

    # Glob results are sorted within each pattern.
    glob_results = {"z": "z",
                    "a": ["x", "d", "b"]}
    with patch('glob.glob', lambda k, **kwargs: glob_results[k]):
        gp = GlobbedPaths(["z", "a"])
        eq_(gp.expand(), ["z", "b", "d", "x"])

    # glob expansion for paths property is determined by expand argument.
    for expand, expected in [(True, ["2.dat", bOBSCURE_FILENAME]),
                             (False, ["*.dat"])]:
        gp = GlobbedPaths(["*.dat"], pwd=path, expand=expand)
        eq_(gp.paths, expected)

    with swallow_logs(new_level=logging.DEBUG) as cml:
        GlobbedPaths(["not here"], pwd=path).expand()
        assert_in("No matching files found for 'not here'", cml.out)

def test_Dataset_flyweight(path1=None, path2=None):
    import gc
    import sys

    ds1 = Dataset(path1)
    assert_is_instance(ds1, Dataset)
    # Don't create circular references or anything similar
    assert_equal(1, sys.getrefcount(ds1) - 1)

    ds1.create()

    # Due to issue 4862, we currently still require gc.collect() under unclear
    # circumstances to get rid of an exception traceback when creating in an
    # existing directory. That traceback references the respective function
    # frames which in turn reference the repo instance (they are methods).
    # Doesn't happen on all systems, though. Eventually we need to figure that
    # out.
    # However, still test for the refcount after gc.collect() to ensure we
    # don't introduce new circular references and make the issue worse!
    gc.collect()

    # refcount still fine after repo creation:
    assert_equal(1, sys.getrefcount(ds1) - 1)

    # instantiate again:
    ds2 = Dataset(path1)
    assert_is_instance(ds2, Dataset)
    # the very same object:
    ok_(ds1 is ds2)

    # reference the same via relative path:
    with chpwd(path1):
        ds3 = Dataset(relpath(path1, start=path2))
        ok_(ds1 == ds3)
        ok_(ds1 is ds3)

    # gc knows one such object only:
    eq_(1, len([o for o in gc.get_objects()
                if isinstance(o, Dataset) and o.path == path1]))

    # on windows a symlink is not what you think it is
    if not on_windows:
        # reference the same via symlink:
        with chpwd(path2):
            os.symlink(path1, 'linked')
            ds4 = Dataset('linked')
            ds4_id = id(ds4)
            ok_(ds4 == ds1)
            ok_(ds4 is not ds1)
            # underlying repo, however, IS the same:
            ok_(ds4.repo is ds1.repo)

    # deleting one reference has no effect on the other:
    del ds1
    gc.collect()  # TODO: see first comment above
    ok_(ds2 is not None)
    ok_(ds2.repo is ds3.repo)
    if not on_windows:
        ok_(ds2.repo is ds4.repo)

    # deleting remaining references should lead to garbage collection
    del ds2

    with swallow_logs(new_level=1) as cml:
        del ds3
        gc.collect()  # TODO: see first comment above
        # flyweight vanished:
        assert_not_in(path1, Dataset._unique_instances.keys())
        # no such instance known to gc anymore:
        eq_([], [o for o in gc.get_objects()
                 if isinstance(o, Dataset) and o.path == path1])
        # underlying repo should only be cleaned up, if ds3 was the last
        # reference to it. Otherwise the repo instance should live on
        # (via symlinked ds4):
        finalizer_log = "Finalizer called on: AnnexRepo(%s)" % path1
        if on_windows:
            cml.assert_logged(msg=finalizer_log,
                              level="Level 1",
                              regex=False)
        else:
            assert_not_in(finalizer_log, cml.out)
            # symlinked is still there:
            ok_(ds4 is not None)
            eq_(ds4_id, id(ds4))

def test_aggregation(path=None):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything aggregated we would get nothing and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
        assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  scope='branch')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     scope='branch')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                        scope='branch')
    ds.save(recursive=True)
    assert_repo_status(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_in_results(res, action='save', status="ok")
    # nice and tidy
    assert_repo_status(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # Now that we have annex.key
    # three different IDs
    assert_equal(
        3,
        len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name']
                 == ensure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok', clone.install('sub', result_xfm=None,
                                      return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])

def test_addurls(self=None, path=None):
    ds = Dataset(path).create(force=True)

    def get_annex_commit_counts():
        return len(ds.repo.get_revisions("git-annex"))

    n_annex_commits = get_annex_commit_counts()

    # Meanwhile also test that we can specify path relative
    # to the top of the dataset, as we generally treat paths in
    # Python API, and it will be the one saved in commit
    # message record
    json_file = op.relpath(self.json_file, ds.path)

    ds.addurls(json_file, "{url}", "{name}",
               exclude_autometa="(md5sum|size)",
               result_renderer='disabled')
    ok_startswith(ds.repo.format_commit('%b', DEFAULT_BRANCH),
                  f"url_file='{json_file}'")

    filenames = ["a", "b", "c"]
    for fname in filenames:
        ok_exists(op.join(ds.path, fname))

    for (fname, meta), subdir in zip(ds.repo.get_metadata(filenames),
                                     ["foo", "bar", "foo"]):
        assert_dict_equal(meta, {"subdir": [subdir], "name": [fname]})

    # Ignore this check if we're faking dates because that disables
    # batch mode.
    # Also ignore if on Windows as it seems as if a git-annex bug
    # leads to separate meta data commits:
    # https://github.com/datalad/datalad/pull/5202#discussion_r535429704
    if not (dl_cfg.get('datalad.fake-dates') or on_windows):
        # We should have two new commits on the git-annex: one for the
        # added urls and one for the added metadata.
        eq_(n_annex_commits + 2, get_annex_commit_counts())

    # Add to already existing links, overwriting.
    with swallow_logs(new_level=logging.DEBUG) as cml:
        ds.addurls(self.json_file, "{url}", "{name}",
                   ifexists="overwrite",
                   result_renderer='disabled')
        for fname in filenames:
            assert_in("Removing {}".format(os.path.join(path, fname)),
                      cml.out)

    # Add to already existing links, skipping.
    assert_in_results(
        ds.addurls(self.json_file, "{url}", "{name}", ifexists="skip",
                   result_renderer='disabled'),
        action="addurls",
        status="notneeded")

    # Add to already existing links works, as long as the content is the same.
    ds.addurls(self.json_file, "{url}", "{name}",
               result_renderer='disabled')

    # But it fails if something has changed.
    ds.unlock("a")
    with open(op.join(ds.path, "a"), "w") as ofh:
        ofh.write("changed")
    ds.save("a")

    assert_raises(IncompleteResultsError,
                  ds.addurls,
                  self.json_file, "{url}", "{name}",
                  result_renderer='disabled')