def test_basic_aggregate(path=None):
    # TODO give datasets some more metadata to actually aggregate stuff
    base = Dataset(opj(path, 'origin')).create(force=True)
    sub = base.create('sub', force=True)
    #base.metadata(sub.path, init=dict(homepage='this'), apply2global=True)
    subsub = base.create(opj('sub', 'subsub'), force=True)
    base.save(recursive=True)
    assert_repo_status(base.path)
    # we will first aggregate the middle dataset on its own, this will
    # serve as a smoke test for the reuse of metadata objects later on
    sub.aggregate_metadata()
    base.save()
    assert_repo_status(base.path)
    base.aggregate_metadata(recursive=True, update_mode='all')
    assert_repo_status(base.path)
    direct_meta = base.metadata(recursive=True, return_type='list')
    # lose the deepest dataset
    sub.drop('subsub', what='all', reckless='kill', recursive=True)
    # now we should be able to reaggregate metadata, and lose nothing,
    # because we can aggregate the aggregated metadata of subsub from sub
    base.aggregate_metadata(recursive=True, update_mode='all')
    # same result for aggregate query as for (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        print(d['path'], a['path'])
        assert_dict_equal(d, a)
    # now we can throw away the subdataset tree, and lose no metadata
    base.drop('sub', what='all', reckless='kill', recursive=True)
    assert (not sub.is_installed())
    assert_repo_status(base.path)
    # same result for aggregate query as for (saved) direct query
    agg_meta = base.metadata(recursive=True, return_type='list')
    for d, a in zip(direct_meta, agg_meta):
        assert_dict_equal(d, a)
def test_result_filter():
    # ensure baseline without filtering
    assert_equal(
        [r['somekey'] for r in TestUtils().__call__(4)],
        [0, 1, 2, 3])
    # test two functionally equivalent ways to filter results
    # 1. constraint-based -- filter by exception
    #    we have a full set of AND and OR operators for this
    # 2. custom filter function -- filter by boolean return value
    for filt in (
            EnsureKeyChoice('somekey', (0, 2)),
            lambda x: x['somekey'] in (0, 2)):
        assert_equal(
            [r['somekey']
             for r in TestUtils().__call__(4, result_filter=filt)],
            [0, 2])
        # constraint returns full dict
        assert_dict_equal(
            TestUtils().__call__(4, result_filter=filt)[-1],
            {'action': 'off', 'path': 'some', 'status': 'ok', 'somekey': 2})

    # test more sophisticated filters that actually get to see the
    # API call's kwargs
    def greatfilter(res, **kwargs):
        assert_equal(kwargs.get('dataset', 'bob'), 'awesome')
        return True
    TestUtils().__call__(4, dataset='awesome', result_filter=greatfilter)

    def sadfilter(res, **kwargs):
        assert_equal(kwargs.get('dataset', 'bob'), None)
        return True
    TestUtils().__call__(4, result_filter=sadfilter)
def test_get_file_parts():
    assert_dict_equal(
        au.get_file_parts("file.tar.gz", "prefix"),
        {"prefix": "file.tar.gz",
         "prefix_root_py": "file.tar",
         "prefix_ext_py": ".gz",
         "prefix_root": "file",
         "prefix_ext": ".tar.gz"})
def test_extract():
    info, subpaths = au.extract(
        ST_DATA["rows"],
        url_format="{name}_{debut_season}.com",
        filename_format="{age_group}//{now_dead}//{name}.csv")

    eq_(subpaths,
        ["adult", "kid",
         op.join("adult", "no"), op.join("adult", "yes"),
         op.join("kid", "no")])

    eq_([d["url"] for d in info],
        ["will_1.com", "bob_2.com", "scott_1.com", "max_2.com"])

    eq_([d["filename"] for d in info],
        [op.join("kid", "no", "will.csv"),
         op.join("adult", "yes", "bob.csv"),
         op.join("adult", "no", "scott.csv"),
         op.join("kid", "no", "max.csv")])

    expects = [
        {"name": "will", "age_group": "kid",
         "debut_season": "1", "now_dead": "no"},
        {"name": "bob", "age_group": "adult",
         "debut_season": "2", "now_dead": "yes"},
        {"name": "scott", "age_group": "adult",
         "debut_season": "1", "now_dead": "no"},
        {"name": "max", "age_group": "kid",
         "debut_season": "2", "now_dead": "no"}]
    for d, expect in zip(info, expects):
        assert_dict_equal(d["meta_args"], expect)

    eq_([d["subpath"] for d in info],
        [op.join("kid", "no"), op.join("adult", "yes"),
         op.join("adult", "no"), op.join("kid", "no")])
def test_assert_dict_equal():
    assert_dict_equal({}, {})
    assert_dict_equal({"a": 3}, {"a": 3})
    assert_raises(AssertionError, assert_dict_equal, {1: 3}, {1: 4})
    assert_raises(AssertionError, assert_dict_equal, {1: 3}, {2: 4})
    assert_raises(AssertionError, assert_dict_equal, {1: 3}, {2: 4, 1: 3})
    assert_raises(AssertionError, assert_dict_equal, {1: 3}, {2: 4, 1: 'a'})
    try:
        import numpy as np
    except:  # pragma: no cover
        pytest.skip("need numpy for this tiny one")
    # one is a scalar, the other an array
    assert_raises(AssertionError, assert_dict_equal,
                  {1: 0}, {1: np.arange(1)})
    assert_raises(AssertionError, assert_dict_equal,
                  {1: 0}, {1: np.arange(3)})
def test_get_url_parts():
    eq_(au.get_url_parts(""), {})
    assert_dict_equal(au.get_url_parts("http://datalad.org"),
                      {"_url_hostname": "datalad.org"})

    assert_dict_equal(
        au.get_url_parts("http://datalad.org/about.html"),
        {"_url_hostname": "datalad.org",
         "_url0": "about.html",
         "_url_basename": "about.html",
         "_url_basename_root_py": "about",
         "_url_basename_ext_py": ".html",
         "_url_basename_root": "about",
         "_url_basename_ext": ".html"})
    assert_dict_equal(au.get_url_parts("http://datalad.org/about.html"),
                      au.get_url_parts("http://datalad.org//about.html"))

    assert_dict_equal(
        au.get_url_parts("http://datalad.org/for/git-users"),
        {"_url_hostname": "datalad.org",
         "_url0": "for",
         "_url1": "git-users",
         "_url_basename": "git-users",
         "_url_basename_root_py": "git-users",
         "_url_basename_ext_py": "",
         "_url_basename_root": "git-users",
         "_url_basename_ext": ""})
def _compare_metadata_helper(origres, compds):
    for ores in origres:
        rpath = relpath(ores['path'], ores['refds'])
        cres = compds.metadata(
            rpath,
            reporton='{}s'.format(ores['type']))
        if ores['type'] == 'file':
            # TODO implement file based lookup
            continue
        assert_result_count(cres, 1)
        cres = cres[0]
        assert_dict_equal(ores['metadata'], cres['metadata'])
        if ores['type'] == 'dataset':
            for i in ('dsid', ):
                eq_(ores[i], cres[i])
def test_compare_content_info(path=None):
    # TODO remove when `create` is RF to return the new Dataset
    ds = Dataset(path).create()
    assert_repo_status(path)

    # for a clean repo HEAD and worktree query should yield identical results
    # minus a 'bytesize' report that is readily available for HEAD, but would
    # require a stat call per file for the worktree, and is not done ATM
    wt = ds.repo.get_content_info(ref=None)
    assert_dict_equal(
        wt,
        {f: {k: v for k, v in p.items() if k != 'bytesize'}
         for f, p in ds.repo.get_content_info(ref='HEAD').items()})
def test_check_dates(path=None):
    skip_if_no_module("dateutil")

    ref_ts = 1218182889  # Fri, 08 Aug 2008 04:08:09 -0400
    refdate = "@{}".format(ref_ts)

    repo = os.path.join(path, "repo")
    with set_date(ref_ts + 5000):
        ar = AnnexRepo(repo)
        ar.add(".")
        ar.commit()

    # The standard renderer outputs json.
    with swallow_outputs() as cmo:
        # Set level to WARNING to avoid the progress bar when
        # DATALAD_TESTS_UI_BACKEND=console.
        with swallow_logs(new_level=logging.WARNING):
            check_dates([repo], reference_date=refdate, return_type="list")
        assert_in("report", json.loads(cmo.out).keys())

    # We find the newer objects.
    newer = call([path], reference_date=refdate)
    eq_(len(newer), 1)
    ok_(newer[0]["report"]["objects"])

    # There are no older objects to find.
    older = call([repo], reference_date=refdate, older=True)
    assert_false(older[0]["report"]["objects"])

    # We can pass the date in RFC 2822 format.
    assert_dict_equal(
        newer[0],
        call([path], reference_date="08 Aug 2008 04:08:09 -0400")[0])

    # paths=None defaults to the current directory.
    with chpwd(path):
        assert_dict_equal(
            newer[0]["report"],
            call(paths=None, reference_date=refdate)[0]["report"])

    # Only the commit type is present when annex='none'.
    newer_noannex = call([path], reference_date=refdate, annex="none")
    for entry in newer_noannex[0]["report"]["objects"].values():
        ok_(entry["type"] == "commit")
def test_rerun_commit_message_check():
    assert_raises(ValueError,
                  get_run_info,
                  None,
                  """\
[DATALAD RUNCMD] no command

=== Do not change lines below ===
{
 "pwd": ".",
 "exit": 0
}
^^^ Do not change lines above ^^^""")

    assert_raises(ValueError,
                  get_run_info,
                  None,
                  """\
[DATALAD RUNCMD] junk json

=== Do not change lines below ===
{
 "pwd": ".,
 "cmd": "echo ok >okfile",
 "exit": 0
}
^^^ Do not change lines above ^^^""")

    subject, info = get_run_info(
        None,
        """\
[DATALAD RUNCMD] fine

=== Do not change lines below ===
{
 "pwd": ".",
 "cmd": "echo ok >okfile",
 "exit": 0
}
^^^ Do not change lines above ^^^""")
    eq_(subject, "fine")
    assert_dict_equal(info,
                      {"pwd": ".", "cmd": "echo ok >okfile", "exit": 0})
def test_extract_exclude_autometa_regexp():
    info, _ = au.extract(
        ST_DATA["rows"],
        url_format="{name}_{debut_season}.com",
        filename_format="{age_group}//{now_dead}//{name}.csv",
        exclude_autometa="ea")
    expects = [{"name": "will", "age_group": "kid"},
               {"name": "bob", "age_group": "adult"},
               {"name": "scott", "age_group": "adult"},
               {"name": "max", "age_group": "kid"}]
    for d, expect in zip(info, expects):
        assert_dict_equal(d["meta_args"], expect)
def test_addurls(self=None, path=None):
    ds = Dataset(path).create(force=True)

    def get_annex_commit_counts():
        return len(ds.repo.get_revisions("git-annex"))

    n_annex_commits = get_annex_commit_counts()

    # Meanwhile also test that we can specify a path relative
    # to the top of the dataset, as we generally treat paths in the
    # Python API, and it will be the one saved in the commit
    # message record
    json_file = op.relpath(self.json_file, ds.path)

    ds.addurls(json_file, "{url}", "{name}",
               exclude_autometa="(md5sum|size)",
               result_renderer='disabled')
    ok_startswith(
        ds.repo.format_commit('%b', DEFAULT_BRANCH),
        f"url_file='{json_file}'")

    filenames = ["a", "b", "c"]
    for fname in filenames:
        ok_exists(op.join(ds.path, fname))

    for (fname, meta), subdir in zip(ds.repo.get_metadata(filenames),
                                     ["foo", "bar", "foo"]):
        assert_dict_equal(meta, {"subdir": [subdir], "name": [fname]})

    # Ignore this check if we're faking dates because that disables
    # batch mode.
    # Also ignore if on Windows as it seems as if a git-annex bug
    # leads to separate meta data commits:
    # https://github.com/datalad/datalad/pull/5202#discussion_r535429704
    if not (dl_cfg.get('datalad.fake-dates') or on_windows):
        # We should have two new commits on the git-annex branch: one for
        # the added urls and one for the added metadata.
        eq_(n_annex_commits + 2, get_annex_commit_counts())

    # Add to already existing links, overwriting.
    with swallow_logs(new_level=logging.DEBUG) as cml:
        ds.addurls(self.json_file, "{url}", "{name}",
                   ifexists="overwrite",
                   result_renderer='disabled')
        for fname in filenames:
            assert_in("Removing {}".format(os.path.join(path, fname)),
                      cml.out)

    # Add to already existing links, skipping.
    assert_in_results(
        ds.addurls(self.json_file, "{url}", "{name}", ifexists="skip",
                   result_renderer='disabled'),
        action="addurls",
        status="notneeded")

    # Adding to already existing links works, as long as the content is
    # the same.
    ds.addurls(self.json_file, "{url}", "{name}",
               result_renderer='disabled')

    # But it fails if something has changed.
    ds.unlock("a")
    with open(op.join(ds.path, "a"), "w") as ofh:
        ofh.write("changed")
    ds.save("a")

    assert_raises(IncompleteResultsError,
                  ds.addurls,
                  self.json_file, "{url}", "{name}",
                  result_renderer='disabled')
def test_get_paths_by_ds(path=None, otherdspath=None):
    otherds = Dataset(otherdspath).create()
    ds = get_deeply_nested_structure(path)
    # for testing below, a shortcut
    subds_modified = Dataset(ds.pathobj / 'subds_modified')
    # check the docstring of get_deeply_nested_structure() to understand
    # what is being tested here
    testcases = (
        # (
        #   (<dataset_arg>, <path arg>),
        #   {<path by ds dict>},
        #   [<error list>]
        # ),
        # find main dataset, pass-through arbitrary arguments; if no paths
        # go in, also no paths come out
        ((path, None),
         {ds.pathobj: None},
         []),
        # a simple path in the rootds stays just that, no traversal
        # into files underneath
        ((ds, ['subdir']),
         {ds.pathobj: [ds.pathobj / 'subdir']},
         []),
        # same for files, any number,
        # one record per dataset with multiple files
        ((ds, [op.join('subdir', 'git_file.txt'), 'directory_untracked']),
         {ds.pathobj: [
             ds.pathobj / 'directory_untracked',
             ds.pathobj / 'subdir' / 'git_file.txt']},
         []),
        # same for a subdataset root -- still reported as part of
        # the superdataset!
        ((ds, ['subds_modified']),
         {ds.pathobj: [subds_modified.pathobj]},
         []),
        # but not with a trailing slash, then it is the subdataset root
        # itself that becomes the record!!!
        ((ds, ['subds_modified' + op.sep]),
         {subds_modified.pathobj: [subds_modified.pathobj]},
         []),
        # however, regardless of the path syntax, each behavior can be forced
        ((ds, ['subds_modified'], 'sub'),
         {subds_modified.pathobj: [subds_modified.pathobj]},
         []),
        ((ds, ['subds_modified' + op.sep], 'super'),
         {ds.pathobj: [subds_modified.pathobj]},
         []),
        # subdataset content is sorted into a subdataset record
        ((ds, [op.join('subds_modified', 'subdir')]),
         {subds_modified.pathobj: [
             ds.pathobj / 'subds_modified' / 'subdir']},
         []),
        # content from different datasets ends up in different records
        ((ds, [op.join('subdir', 'git_file.txt'),
               op.join('subds_modified', 'subdir'),
               op.join('subds_modified', 'subds_lvl1_modified')]),
         {ds.pathobj: [ds.pathobj / 'subdir' / 'git_file.txt'],
          subds_modified.pathobj: [
              subds_modified.pathobj / 'subdir',
              subds_modified.pathobj / 'subds_lvl1_modified']},
         []),
        # paths not matching existing content are no problem
        ((ds, ['doesnotexist',
               op.join('subdir', 'nothere'),
               op.join('subds_modified', 'subdir', 'gone')]),
         {ds.pathobj: [
             ds.pathobj / 'doesnotexist',
             ds.pathobj / 'subdir' / 'nothere'],
          subds_modified.pathobj: [
              subds_modified.pathobj / 'subdir' / 'gone']},
         []),
        #
        # now the error case
        #
        # a path that does not sort under the root dataset
        ((path, [otherds.pathobj / 'totally' / 'different']),
         {},
         [otherds.pathobj / 'totally' / 'different']),
    )
    # evaluate the test cases
    for inp, pbd_target, error_target in testcases:
        paths_by_ds, errors = get_paths_by_ds(ds, *inp)
        assert_dict_equal(pbd_target, paths_by_ds)
        eq_(error_target, errors)

    # lastly, some more specialized tests
    # paths get collapsed into dataset records, even when the path
    # order is not presorted to match individual datasets sequentially
    paths_by_ds, errors = get_paths_by_ds(ds, ds, [
        op.join('subdir', 'git_file.txt'),
        op.join('subds_modified', 'subdir'),
        op.join('subdir', 'annexed_file.txt'),
    ])
    eq_(list(paths_by_ds.keys()), [ds.pathobj, subds_modified.pathobj])
    # result order (top-level first) is stable, even when a path comes first
    # that sorts later. Also mixed types are not a problem
    paths_by_ds, errors = get_paths_by_ds(ds, ds, [
        ds.pathobj / 'subds_modified' / 'subdir',
        op.join('subdir', 'git_file.txt'),
        op.join('subds_modified', 'subdir', 'annexed_file.txt'),
    ])
    eq_(list(paths_by_ds.keys()), [ds.pathobj, subds_modified.pathobj])
def test_target_ssh_simple(origin=None, src_path=None, target_rootpath=None):
    ca = dict(result_renderer='disabled')
    test_fname = 'test-annex.dat'
    orig = Dataset(origin).create(**ca)
    (orig.pathobj / test_fname).write_text('some')
    orig.save(**ca)

    port = get_ssh_port("datalad-test")
    # prepare src
    source = install(src_path, source=origin,
                     result_xfm='datasets', return_type='item-or-list')

    target_path = opj(target_rootpath, "basic")
    with swallow_logs(new_level=logging.ERROR) as cml:
        create_sibling(dataset=source,
                       name="local_target",
                       sshurl="ssh://*****:*****@

    @with_testsui(responses=["yes"])
    def interactive_assert_create_sshwebserver():
        assert_create_sshwebserver(
            dataset=source,
            name="local_target",
            sshurl="ssh://datalad-test" + target_path,
            publish_by_default=DEFAULT_BRANCH,
            existing='replace',
            ui=have_webui(),
        )
    interactive_assert_create_sshwebserver()

    eq_("ssh://datalad-test" + urlquote(target_path),
        source.repo.get_remote_url("local_target"))
    ok_(source.repo.get_remote_url("local_target", push=True) is None)

    # ensure target tree actually replaced by source
    assert_false(exists(opj(target_path, 'random')))

    if src_is_annex:
        lclcfg = AnnexRepo(src_path).config
        eq_(lclcfg.get('remote.local_target.annex-ignore'), 'false')
        # valid uuid
        eq_(lclcfg.get('remote.local_target.annex-uuid').count('-'), 4)
        # should be added too, even if URL matches prior state
        eq_(lclcfg.get('remote.local_target.push'), DEFAULT_BRANCH)

    # again, by explicitly passing urls. Since we are on datalad-test, the
    # local path should work:
    cpkwargs = dict(
        dataset=source,
        name="local_target",
        sshurl="ssh://datalad-test",
        target_dir=target_path,
        target_url=target_path,
        target_pushurl="ssh://datalad-test" + target_path,
        ui=have_webui(),
    )

    @with_testsui(responses=['yes'])
    def interactive_assert_create_sshwebserver():
        assert_create_sshwebserver(existing='replace', **cpkwargs)
    interactive_assert_create_sshwebserver()

    if src_is_annex:
        target_description = AnnexRepo(target_path,
                                       create=False).get_description()
        eq_(target_description, target_path)

    eq_(target_path,
        source.repo.get_remote_url("local_target"))
    eq_("ssh://datalad-test" + target_path,
        source.repo.get_remote_url("local_target", push=True))

    if have_webui():
        from datalad_deprecated.tests.test_create_sibling_webui import (
            assert_publish_with_ui,
        )
        assert_publish_with_ui(target_path)

    # now, push should work:
    push(dataset=source, to="local_target")

    # and we should be able to 'reconfigure'
    def process_digests_mtimes(digests, mtimes):
        # it should have triggered a hook, which would have created log and
        # metadata files
        check_metadata = False
        for part in 'logs', 'metadata':
            metafiles = [
                k for k in digests
                if k.startswith(_path_('.git/datalad/%s/' % part))]
            # This is in effect ONLY if we have "compatible" datalad
            # installed on the remote end.
            # ATM we don't have an easy way to guarantee that AFAIK (yoh),
            # so let's not check/enforce (TODO)
            # assert(len(metafiles) >= 1)

            # we might have 2 logs if timestamps do not collide ;)
            # Let's actually do it to some degree
            if part == 'logs':
                # always should have those:
                assert (len(metafiles) >= 1)
                with open(opj(target_path, metafiles[0])) as f:
                    if 'no datalad found' not in f.read():
                        check_metadata = True
            if part == 'metadata':
                eq_(len(metafiles), bool(check_metadata))
            for f in metafiles:
                digests.pop(f)
                mtimes.pop(f)
        # and just pop some leftovers from annex
        # and ignore .git/logs content (gh-5298)
        for f in list(digests):
            if f.startswith('.git/annex/mergedrefs') \
                    or f.startswith('.git/logs/'):
                digests.pop(f)
                mtimes.pop(f)

    if not have_webui():
        # the rest of the test assumes that we have uploaded a UI
        return

    orig_digests, orig_mtimes = get_mtimes_and_digests(target_path)
    process_digests_mtimes(orig_digests, orig_mtimes)

    import time
    time.sleep(0.1)  # just so that mtimes change
    assert_create_sshwebserver(existing='reconfigure', **cpkwargs)
    digests, mtimes = get_mtimes_and_digests(target_path)
    process_digests_mtimes(digests, mtimes)

    # nothing should change in terms of content
    assert_dict_equal(orig_digests, digests)

    # but some files should have been modified
    modified_files = {
        k for k in mtimes if orig_mtimes.get(k, 0) != mtimes.get(k, 0)}
    # collect which files were expected to be modified without incurring
    # any changes
    ok_modified_files = {
        _path_('.git/hooks/post-update'),
        'index.html',
    }
    ok_modified_files.add(_path_('.git/config'))
    ok_modified_files.update(
        {f for f in digests if f.startswith(_path_('.git/datalad/web'))})
    # it seems that with some recent git the behavior has changed a bit
    # and the index might get touched
    if _path_('.git/index') in modified_files:
        ok_modified_files.add(_path_('.git/index'))
    ok_(modified_files.issuperset(ok_modified_files))
def test_discover_ds_trace(path=None, otherdir=None):
    ds = make_demo_hierarchy_datasets(
        path,
        {k: v for k, v in demo_hierarchy.items() if k in ['a', 'd']})
    a = opj(ds.path, 'a')
    aa = opj(a, 'aa')
    d = opj(ds.path, 'd')
    db = opj(d, 'db')
    # we have to check whether we get the correct hierarchy, as the test
    # subject is also involved in this
    assert_true(exists(opj(db, 'file_db')))
    ds.save(recursive=True)
    assert_repo_status(ds.path)
    # now two datasets which are not available locally, but we
    # know about them (e.g. from metadata)
    dba = opj(db, 'sub', 'dba')
    dbaa = opj(dba, 'subsub', 'dbaa')
    for input, eds, goal in (
            ([], None, {}),
            ([ds.path], None, {}),
            ([otherdir], None, {}),
            ([opj(ds.path, 'nothere')], None, {}),
            ([opj(d, 'nothere')], None, {}),
            ([opj(db, 'nothere')], None, {}),
            ([a], None,
             {ds.path: set([a])}),
            ([aa, a], None,
             {ds.path: set([a]), a: set([aa])}),
            ([db], None,
             {ds.path: set([d]), d: set([db])}),
            ([opj(db, 'file_db')], None,
             {ds.path: set([d]), d: set([db])}),
            # just a regular non-existing path
            ([dba], None, {}),
            # but if we inject this knowledge it must come back out
            # as the child of the closest existing dataset
            ([dba], [dba],
             {ds.path: set([d]), d: set([db]), db: set([dba])}),
            # regardless of the depth
            ([dbaa], [dbaa],
             {ds.path: set([d]), d: set([db]), db: set([dbaa])}),
            ([dba, dbaa], [dba, dbaa],
             {ds.path: set([d]), d: set([db]), db: set([dba, dbaa])}),
            # we can simply add existing and non-existing datasets to the
            # include list to get the desired result
            ([d, dba, dbaa], [d, dba, dbaa],
             {ds.path: set([d]), d: set([db]), db: set([dba, dbaa])}),
    ):
        spec = {}
        discover_dataset_trace_to_targets(
            ds.path, input, [], spec, includeds=eds)
        assert_dict_equal(spec, goal)