def test_dont_trip_over_missing_subds(path):
    ds1 = Dataset(opj(path, 'ds1')).create()
    ds2 = Dataset(opj(path, 'ds2')).create()
    subds2 = ds1.install(
        source=ds2.path, path='subds2',
        result_xfm='datasets', return_type='item-or-list')
    assert_true(subds2.is_installed())
    assert_in('subds2', ds1.subdatasets(result_xfm='relpaths'))
    subds2.uninstall()
    assert_in('subds2', ds1.subdatasets(result_xfm='relpaths'))
    assert_false(subds2.is_installed())
    # see if it wants to talk to github (and fail), or if it trips over
    # something before that
    assert_raises(gh.BadCredentialsException,
                  ds1.create_sibling_github, 'bogus', recursive=True,
                  github_login='******')
    # inject remote config prior to the run
    assert_not_in('github', ds1.repo.get_remotes())
    # fail on existing
    ds1.repo.add_remote('github', 'http://nothere')
    assert_raises(ValueError,
                  ds1.create_sibling_github, 'bogus', recursive=True,
                  github_login='******')
    # talk to github when existing is OK
    assert_raises(gh.BadCredentialsException,
                  ds1.create_sibling_github, 'bogus', recursive=True,
                  github_login='******', existing='reconfigure')
    # return happy emptiness when all is skipped
    assert_equal(
        ds1.create_sibling_github(
            'bogus', recursive=True,
            github_login='******', existing='skip'),
        [])
def test_aggregate_with_missing_or_duplicate_id(path):
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subds.remove(opj('.datalad', 'config'), if_dirty='ignore')
    assert_false(exists(opj(subds.path, '.datalad', 'config')))
    subsubds = subds.create('subsub', force=True)
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 metadata sets: one implicit and one native per dataset,
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from
    # the cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    # and we still know about the grandchild ('subsub')
    for name in ('grandchild_äöü東',):
        assert_true(sum([s.get('name', '') == assure_unicode(name)
                         for s in meta]))
    # but search should not fail
    with swallow_outputs():
        res1 = list(search_('.', regex=True, dataset=ds))
    assert res1
    # and let's see now that we don't fail on a duplicate dataset ID, i.e.
    # if we install the same dataset twice
    subds_clone = ds.install(source=subds.path, path="subds2")
    with swallow_outputs():
        res2 = list(search_('.', regex=True, dataset=ds))
def test_ignore_nondatasets(path):
    # we want to ignore the version/commits for this test
    def _kill_time(meta):
        for m in meta:
            for k in ('version', 'dcterms:modified'):
                if k in m:
                    del m[k]
        return meta

    ds = Dataset(path).create()
    meta = _kill_time(get_metadata(ds))
    n_subm = 0
    # placing another repo in the dataset has no effect on metadata
    for cls, subpath in ((GitRepo, 'subm'), (AnnexRepo, 'annex_subm')):
        subm_path = opj(ds.path, subpath)
        r = cls(subm_path, create=True)
        with open(opj(subm_path, 'test'), 'w') as f:
            f.write('test')
        r.add('test')
        r.commit('some')
        assert_true(Dataset(subm_path).is_installed())
        assert_equal(meta, _kill_time(get_metadata(ds)))
        # making it a submodule has no effect either
        ds.add(subpath)
        assert_equal(len(ds.get_subdatasets()), n_subm + 1)
        assert_equal(meta, _kill_time(get_metadata(ds)))
        n_subm += 1
def test_addurls_dry_run(path):
    ds = Dataset(path).create(force=True)
    with chpwd(path):
        json_file = "links.json"
        with open(json_file, "w") as jfh:
            json.dump([{"url": "URL/a.dat", "name": "a", "subdir": "foo"},
                       {"url": "URL/b.dat", "name": "b", "subdir": "bar"},
                       {"url": "URL/c.dat", "name": "c", "subdir": "foo"}],
                      jfh)
        ds.save(message="setup")
        with swallow_logs(new_level=logging.INFO) as cml:
            ds.addurls(json_file,
                       "{url}",
                       "{subdir}//{_url_filename_root}",
                       dry_run=True)
            for dir_ in ["foo", "bar"]:
                assert_in("Would create a subdataset at {}".format(dir_),
                          cml.out)
            assert_in(
                "Would download URL/a.dat to {}".format(
                    os.path.join(path, "foo", "BASE")),
                cml.out)
            assert_in("Metadata: {}".format([u"name=a", u"subdir=foo"]),
                      cml.out)
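# The "{subdir}//{_url_filename_root}" template above uses addurls' path
# syntax, where '//' marks subdataset boundaries. A minimal sketch of that
# interpretation, for illustration only -- `expand_template` is a
# hypothetical helper, not part of the datalad API:
def expand_template(template, row):
    # substitute {field} placeholders from the row, then split on '//' to
    # separate subdataset components from the trailing file path
    filled = template.format(**row)
    *subdatasets, filename = filled.split("//")
    return subdatasets, filename

# e.g. the first row of links.json would land in subdataset 'foo':
assert expand_template("{subdir}//{name}",
                       {"subdir": "foo", "name": "a"}) == (["foo"], "a")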
def test_addurls_url_parts(self, path):
    ds = Dataset(path).create(force=True)
    with chpwd(path):
        ds.addurls(self.json_file, "{url}", "{_url0}/{_url_basename}")
        for fname in ["a.dat", "b.dat", "c.dat"]:
            ok_exists(op.join("udir", fname))
def test_bf2458(src, dst):
    ds = Dataset(src).create(force=True)
    ds.save(to_git=False)

    # now clone (empty) into new dst
    clone = install(source=ds.path, path=dst)
    # content is not here
    eq_(clone.repo.whereis('dummy'), [ds.config.get('annex.uuid')])
    # check that plain metadata access does not `get` stuff
    clone.metadata('.', on_failure='ignore')
    eq_(clone.repo.whereis('dummy'), [ds.config.get('annex.uuid')])
def test_addurls_repindex(self, path):
    ds = Dataset(path).create(force=True)
    with chpwd(path):
        with assert_raises(IncompleteResultsError) as raised:
            ds.addurls(self.json_file, "{url}", "{subdir}")
        assert_in("There are file name collisions", str(raised.exception))

        ds.addurls(self.json_file, "{url}", "{subdir}-{_repindex}")
        for fname in ["foo-0", "bar-0", "foo-1"]:
            ok_exists(fname)
def test_addurls_metafail(self, path):
    ds = Dataset(path).create(force=True)

    # Force failure by passing a non-existent file name to annex.
    fn = ds.repo.set_metadata_

    def set_meta(_, **kwargs):
        for i in fn("wreaking-havoc-and-such", **kwargs):
            yield i

    with chpwd(path), patch.object(ds.repo, 'set_metadata_', set_meta):
        with assert_raises(IncompleteResultsError):
            ds.addurls(self.json_file, "{url}", "{name}")
def test_add_file(client, annex_path):
    ds_id = 'ds000001'
    file_data = 'Test dataset README'
    response = client.simulate_post(
        '/datasets/{}/files/README'.format(ds_id), body=file_data)
    assert response.status == falcon.HTTP_OK
    # Load the dataset to check for this file
    ds_obj = Dataset(os.path.join(annex_path, ds_id))
    test_files = ds_obj.get('README')
    assert test_files
    assert len(test_files) == 1
    with open(test_files.pop()['path']) as f:
        assert f.read() == file_data
def test_specialremote(dspath=None, remotepath=None):
    ds = Dataset(dspath).create()
    ds.repo.call_annex([
        'initremote',
        'myremote',
        'type=directory',
        f'directory={remotepath}',
        'encryption=none'
    ])
    res = ds.siblings('query', result_renderer='disabled')
    assert_in_results(
        res,
        **{'name': 'myremote',
           'annex-type': 'directory',
           'annex-directory': remotepath})
def test_addurls_metafail(self, path):
    ds = Dataset(path).create(force=True)

    # Force failure by passing a non-existent file name to annex.
    fn = ds.repo.set_metadata_

    def set_meta(_, **kwargs):
        for i in fn("wreaking-havoc-and-such", **kwargs):
            yield i

    with patch.object(ds.repo, 'set_metadata_', set_meta):
        with assert_raises(IncompleteResultsError):
            ds.addurls(self.json_file, "{url}", "{name}")
def setup(self, tarfile_path):
    import tarfile
    tempdir = osp.dirname(tarfile_path)
    with tarfile.open(tarfile_path) as tar:
        tar.extractall(tempdir)

    # TODO -- remove this abomination after
    # https://github.com/datalad/datalad/issues/1512 is fixed
    epath = opj(tempdir, 'testds1')
    epath_unique = epath + str(self.__class__.ds_count)
    os.rename(epath, epath_unique)
    self.__class__.ds_count += 1
    self.ds = Dataset(epath_unique)
    print("Finished setup for %s" % tempdir)
def test_read_access(store_path, store_url, ds_path):
    ds = Dataset(ds_path).create()
    populate_dataset(ds)
    ds.save()

    if ds.repo.is_managed_branch():
        # TODO: on crippled FS copytree to populate the store doesn't seem
        # to work. Or maybe it's just the serving via HTTP that doesn't
        # work. Either way, after copytree and fsck, whereis doesn't
        # report the store as an available source.
        raise SkipTest("Skip on crippled FS")

    files = [Path('one.txt'), Path('subdir') / 'two']
    store_path = Path(store_path)
    url = "ria+" + store_url
    init_opts = common_init_opts + ['url={}'.format(url)]

    io = LocalIO()
    create_store(io, store_path, '1')
    create_ds_in_store(io, store_path, ds.id, '2', '1')
    ds.repo.init_remote('ora-remote', options=init_opts)
    ds.repo.fsck(remote='ora-remote', fast=True)
    store_uuid = ds.siblings(name='ora-remote',
                             return_type='item-or-list')['annex-uuid']
    here_uuid = ds.siblings(name='here',
                            return_type='item-or-list')['annex-uuid']

    # nothing in store yet:
    for f in files:
        known_sources = ds.repo.whereis(str(f))
        assert_in(here_uuid, known_sources)
        assert_not_in(store_uuid, known_sources)

    annex_obj_target = str(store_path / ds.id[:3] / ds.id[3:]
                           / 'annex' / 'objects')
    shutil.rmtree(annex_obj_target)
    shutil.copytree(src=str(ds.repo.dot_git / 'annex' / 'objects'),
                    dst=annex_obj_target)

    ds.repo.fsck(remote='ora-remote', fast=True)
    # all in store now:
    for f in files:
        known_sources = ds.repo.whereis(str(f))
        assert_in(here_uuid, known_sources)
        assert_in(store_uuid, known_sources)

    ds.drop('.')
    res = ds.get('.')
    assert_equal(len(res), 2)
    assert_result_count(res, 2, status='ok', type='file', action='get',
                        message="from ora-remote...")
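# The `store_path / ds.id[:3] / ds.id[3:]` expression above reflects how a
# RIA store shards datasets by their UUID. A small sketch of that layout
# computation, assuming only what the test itself relies on:
from pathlib import Path

def store_dataset_path(store_path, ds_id):
    # the first three characters of the dataset ID form one directory
    # level, the remainder the next; annex objects live underneath it
    return Path(store_path) / ds_id[:3] / ds_id[3:]

# e.g. annex objects of a dataset would live under:
# store_dataset_path('/store', ds.id) / 'annex' / 'objects'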
def test_create_osf_simple(path):
    ds = Dataset(path).create(force=True)
    ds.save()

    file1 = Path('ds') / "file1.txt"

    create_results = ds.create_sibling_osf(name="osf")

    assert_result_count(create_results, 2, status='ok')
    assert_result_count(create_results, 1, status='ok', type='dataset',
                        name="osf-storage", path=ds.path)
    assert_result_count(create_results, 1, status='ok', type='sibling',
                        name="osf", path=ds.path)

    # if we got here, we created something at OSF;
    # make sure we clean up afterwards
    try:
        # special remote is configured:
        remote_log = ds.repo.call_git(
            ['cat-file', 'blob', 'git-annex:remote.log'])
        assert_in("node={}".format(create_results[0]['id']), remote_log)

        # copy files over
        ds.repo.copy_to('.', "osf-storage")
        whereis = ds.repo.whereis(str(file1))
        here = ds.config.get("annex.uuid")
        # files should be 'here' and on the remote end:
        assert_equal(len(whereis), 2)
        assert_in(here, whereis)

        # drop content here
        ds.drop('.')
        whereis = ds.repo.whereis(str(file1))
        # now on the remote end only
        assert_equal(len(whereis), 1)
        assert_not_in(here, whereis)

        # and get content again from the remote:
        ds.get('.')
        whereis = ds.repo.whereis(str(file1))
        assert_equal(len(whereis), 2)
        assert_in(here, whereis)
    finally:
        # clean remote end:
        cred = get_credentials(allow_interactive=False)
        osf = OSF(**cred)
        delete_node(osf.session, create_results[0]['id'])
def test_create_push_url(detection_path, ds_path, store_path):
    store_path = Path(store_path)
    ds_path = Path(ds_path)
    detection_path = Path(detection_path)

    ds = Dataset(ds_path).create(force=True)
    ds.save()

    # patch SSHConnection to signal it was used:
    from datalad.support.sshconnector import SSHManager

    def detector(f, d):
        @wraps(f)
        def _wrapper(*args, **kwargs):
            d.touch()
            return f(*args, **kwargs)
        return _wrapper

    url = "ria+{}".format(store_path.as_uri())
    push_url = "ria+ssh://datalad-test{}".format(store_path.as_posix())
    assert not detection_path.exists()

    with patch('datalad.support.sshconnector.SSHManager.get_connection',
               new=detector(SSHManager.get_connection, detection_path)):
        ds.create_sibling_ria(url, "datastore", push_url=push_url)
        # used ssh_manager despite the file-url, hence used the push-url
        # (ria+ssh):
        assert detection_path.exists()

        # correct config in special remote:
        sr_cfg = ds.repo.get_special_remotes()[
            ds.siblings(name='datastore-storage')[0]['annex-uuid']]
        eq_(sr_cfg['url'], url)
        eq_(sr_cfg['push-url'], push_url)

        # git remote based on url (local path):
        eq_(ds.config.get("remote.datastore.url"),
            (store_path / ds.id[:3] / ds.id[3:]).as_posix())
        eq_(ds.config.get("remote.datastore.pushurl"),
            "ssh://datalad-test{}".format(
                (store_path / ds.id[:3] / ds.id[3:]).as_posix()))

        # git-push uses SSH:
        detection_path.unlink()
        ds.push('.', to="datastore", data='nothing')
        assert detection_path.exists()

        # data push
        # Note that here the patching has no effect, since the special
        # remote is running in a subprocess of git-annex. Hence we can't
        # really detect SSH usage. However, the ORA remote is tested
        # elsewhere -- if it succeeds, all should be good wrt
        # `create-sibling-ria`.
        ds.repo.call_annex(['copy', '.', '--to', 'datastore-storage'])
def test_sibling_enable_sameas(repo, clone_path):
    ds = Dataset(repo.path)
    create_tree(ds.path, {"f0": "0"})
    ds.save(path="f0")
    ds.repo.copy_to(["f0"], remote="r_dir")

    ds.repo.drop(["f0"])
    ds_cloned = clone(ds.path, clone_path)

    assert_false(ds_cloned.repo.file_has_content("f0"))
    res = ds_cloned.siblings(action="enable", name="r_rsync")
    assert_status("ok", res)
    ds_cloned.get(path=["f0"])
    ok_(ds_cloned.repo.file_has_content("f0"))
def test_sibling_path_is_posix(basedir=None, otherpath=None):
    ds_source = Dataset(opj(basedir, "source")).create()
    # add a remote with a system-native path
    ds_source.siblings(
        action="add",
        name="donotexist",
        url=otherpath,
        result_renderer='disabled')
    res = ds_source.siblings(
        action="query",
        name="donotexist",
        result_renderer='disabled',
        return_type='item-or-list')
    # the path URL should come out POSIX, as if `git clone` had configured
    # it for origin
    # https://github.com/datalad/datalad/issues/3972
    eq_(res['url'], Path(otherpath).as_posix())
def tag_releases(
    self, dandiset: RemoteDandiset, ds: Dataset, push: bool
) -> None:
    if not self.config.enable_tags:
        return
    log.info("Tagging releases for Dandiset %s", dandiset.identifier)
    versions = [v for v in dandiset.get_versions() if v.identifier != "draft"]
    for v in versions:
        if readcmd("git", "tag", "-l", v.identifier, cwd=ds.path):
            log.debug("Version %s already tagged", v.identifier)
        else:
            log.info("Tagging version %s", v.identifier)
            self.mkrelease(dandiset.for_version(v), ds, push=push)
    if versions:
        latest = max(map(attrgetter("identifier"), versions), key=Version)
        description = readcmd(
            "git", "describe", "--tags", "--long", "--always", cwd=ds.path
        )
        if "-" not in description:
            # No tags on default branch
            merge = True
        else:
            m = re.fullmatch(
                r"(?P<tag>.+)-(?P<distance>[0-9]+)-g(?P<rev>[0-9a-f]+)?",
                description,
            )
            assert m, f"Could not parse `git describe` output: {description!r}"
            merge = Version(latest) > Version(m["tag"])
        if merge:
            log.debug("Running: git merge -s ours %s", shlex.quote(latest))
            subprocess.run(
                [
                    "git",
                    "merge",
                    "-s",
                    "ours",
                    "-m",
                    f"Merge '{latest}' into drafts branch (no differences"
                    " in content merged)",
                    latest,
                ],
                cwd=ds.path,
                check=True,
            )
        if push:
            ds.push(to="github", jobs=self.config.jobs)
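# The `git describe --tags --long --always` output parsed above has the
# form '<tag>-<distance>-g<rev>' when a tag is reachable, and a bare commit
# hash otherwise. A standalone check of the same regex; the sample strings
# below are illustrative:
import re

DESCRIBE_RE = r"(?P<tag>.+)-(?P<distance>[0-9]+)-g(?P<rev>[0-9a-f]+)?"

m = re.fullmatch(DESCRIBE_RE, "0.210831.2033-3-gabc1234")
assert m and m["tag"] == "0.210831.2033" and m["distance"] == "3"
# with no tags on the branch, `--always` yields just a hash, which
# tag_releases detects by the absence of "-" before attempting the match
assert "-" not in "abc1234"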
def check_api(no_annex, path):
    ds = Dataset(path).create(force=True, no_annex=no_annex)
    ds.add('.')
    ok_clean_git(ds.path)

    processed_extractors, skipped_extractors = [], []
    for extractor_ep in iter_entry_points('datalad.metadata.extractors'):
        # we need to be able to query for metadata, even if there is none
        # from any extractor
        try:
            extractor_cls = extractor_ep.load()
        except Exception as exc:
            exc_ = str(exc)
            skipped_extractors += [exc_]
            continue
        extractor = extractor_cls(ds, paths=['file.dat'])
        meta = extractor.get_metadata(dataset=True, content=True)
        # we also get something for the dataset and something for the
        # content, even if either of the two is empty
        assert_equal(len(meta), 2)
        dsmeta, contentmeta = meta
        assert isinstance(dsmeta, dict)
        assert hasattr(contentmeta, '__len__') or isgenerator(contentmeta)
        # verify that the generator does not blow up and has an entry for
        # our precious file
        cm = dict(contentmeta)
        # datalad_core does provide some (not really) information about
        # our precious file
        if extractor_ep.name == 'datalad_core':
            assert 'file.dat' in cm
        elif extractor_ep.name == 'annex':
            if not no_annex:
                # verify the correct key, which is the same for all files
                # of 0 size
                assert_equal(
                    cm['file.dat']['key'],
                    'MD5E-s0--d41d8cd98f00b204e9800998ecf8427e.dat')
            else:
                # no metadata on that file
                assert not cm
        processed_extractors.append(extractor_ep.name)
    assert "datalad_core" in processed_extractors, \
        "Should have managed to find at least the core extractor"
    if skipped_extractors:
        raise SkipTest(
            "Not fully tested/succeeded since some extractors failed"
            " to load:\n%s" % ("\n".join(skipped_extractors)))
def test_drop_after(self=None, path=None):
    ds = Dataset(path).create(force=True)
    ds.repo.set_gitattributes([('a*', {'annex.largefiles': 'nothing'})])
    # make some files go to git, so we could test that we do not blow up
    # while trying to drop what is in git, not annex
    res = ds.addurls(self.json_file, '{url}', '{name}',
                     drop_after=True,
                     result_renderer='disabled')

    assert_result_count(res, 3, action='addurl',
                        status='ok')  # a, b, c even if a goes to git
    assert_result_count(res, 2, action='drop', status='ok')  # b, c
def test_add_readme(path):
    ds = Dataset(path).create(force=True)
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    assert_status('ok', ds.plugin('add_readme'))
    # should use the default name
    eq_(
        open(opj(path, 'README.md')).read(),
        """\
# Dataset "demo_ds"

this is for play

### Authors

- Betty
- Tom

### License

PDDL

## General information

This is a DataLad dataset (id: {id}).

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://docs.datalad.org
""".format(id=ds.id))
    # should skip on re-run
    assert_status('notneeded', ds.plugin('add_readme'))
def test_addurls_url_on_collision_choose(self=None, path=None):
    ds = Dataset(path).create(force=True)
    data = deepcopy(self.data)
    for row in data:
        row["name"] = "a"

    with patch("sys.stdin", new=StringIO(json.dumps(data))):
        assert_in_results(
            ds.addurls("-", "{url}", "{name}", on_failure="ignore"),
            action="addurls",
            status="error")

    with patch("sys.stdin", new=StringIO(json.dumps(data))):
        assert_in_results(
            ds.addurls("-", "{url}", "{name}",
                       on_collision="error-if-different",
                       on_failure="ignore"),
            action="addurls",
            status="error")

    with patch("sys.stdin", new=StringIO(json.dumps(data))):
        ds.addurls("-", "{url}", "{name}-first",
                   on_collision="take-first")
    ok_file_has_content(op.join(ds.path, "a-first"), "a content",
                        strip=True)

    with patch("sys.stdin", new=StringIO(json.dumps(data))):
        ds.addurls("-", "{url}", "{name}-last",
                   on_collision="take-last")
    ok_file_has_content(op.join(ds.path, "a-last"), "c content",
                        strip=True)
def test_addurls_subdataset(self, path):
    ds = Dataset(path).create(force=True)
    with chpwd(path):
        for save in True, False:
            label = "save" if save else "nosave"
            hexsha_before = ds.repo.get_hexsha()
            ds.addurls(self.json_file, "{url}",
                       "{subdir}-" + label + "//{name}",
                       save=save)
            hexsha_after = ds.repo.get_hexsha()

            for fname in ["foo-{}/a", "bar-{}/b", "foo-{}/c"]:
                ok_exists(fname.format(label))

            assert_true(save ^ (hexsha_before == hexsha_after))
            assert_true(save ^ ds.repo.dirty)

        # Now save the "--nosave" changes and check that we have
        # all the subdatasets.
        ds.add(".")
        eq_(set(subdatasets(ds, recursive=True, result_xfm="relpaths")),
            {"foo-save", "bar-save", "foo-nosave", "bar-nosave"})

        # We don't try to recreate existing subdatasets.
        with swallow_logs(new_level=logging.DEBUG) as cml:
            ds.addurls(self.json_file, "{url}", "{subdir}-nosave//{name}")
            assert_in("Not creating subdataset at existing path", cml.out)
def check_api(no_annex, path):
    ds = Dataset(path).create(force=True, no_annex=no_annex)
    ds.save()
    assert_repo_status(ds.path)

    processed_extractors, skipped_extractors = [], []
    for extractor_ep in iter_entry_points('datalad.metadata.extractors'):
        # we need to be able to query for metadata, even if there is none
        # from any extractor
        try:
            extractor_cls = extractor_ep.load()
        except Exception as exc:
            exc_ = str(exc)
            skipped_extractors += [exc_]
            continue
        extractor = extractor_cls(ds, paths=['file.dat'])
        meta = extractor.get_metadata(dataset=True, content=True)
        # we also get something for the dataset and something for the
        # content, even if either of the two is empty
        assert_equal(len(meta), 2)
        dsmeta, contentmeta = meta
        assert isinstance(dsmeta, dict)
        assert hasattr(contentmeta, '__len__') or isgenerator(contentmeta)
        # verify that the generator does not blow up and has an entry for
        # our precious file
        cm = dict(contentmeta)
        # datalad_core does provide some (not really) information about
        # our precious file
        if extractor_ep.name == 'datalad_core':
            assert 'file.dat' in cm
        elif extractor_ep.name == 'annex':
            if not no_annex:
                # verify the correct key, which is the same for all files
                # of 0 size
                assert_equal(
                    cm['file.dat']['key'],
                    'MD5E-s0--d41d8cd98f00b204e9800998ecf8427e.dat')
            else:
                # no metadata on that file
                assert not cm
        processed_extractors.append(extractor_ep.name)
    assert "datalad_core" in processed_extractors, \
        "Should have managed to find at least the core extractor"
    if skipped_extractors:
        raise SkipTest(
            "Not fully tested/succeeded since some extractors failed"
            " to load:\n%s" % ("\n".join(skipped_extractors)))
def test_reproin_largely_smoke(tmpdir, heuristic, invocation):
    is_bids = True if heuristic == 'reproin' else False
    arg = "--random-seed 1 -f %s -c dcm2niix -o %s" % (heuristic, tmpdir)
    if is_bids:
        arg += " -b"
    arg += " --datalad "
    args = (arg + invocation).split(' ')

    # Test some safeguards
    if invocation == "--files %s" % TESTS_DATA_PATH:
        # Multiple subjects must not be specified -- only a single one
        # could be overridden from the command line
        with pytest.raises(ValueError):
            runner(args + ['--subjects', 'sub1', 'sub2'])

        if heuristic != 'reproin':
            # if the subject is not overridden, raise an error
            with pytest.raises(NotImplementedError):
                runner(args)
            return

    runner(args)
    ds = Dataset(str(tmpdir))
    assert ds.is_installed()
    assert not ds.repo.dirty
    head = ds.repo.get_hexsha()

    # and if we rerun -- should fail
    lgr.info(
        "RERUNNING, expecting to FAIL since the same everything "
        "and -c specified so we did conversion already"
    )
    with pytest.raises(RuntimeError):
        runner(args)

    # but there should be nothing new
    assert not ds.repo.dirty
    assert head == ds.repo.get_hexsha()

    # unless we pass the 'overwrite' flag
    runner(args + ['--overwrite'])
    # but the result should be exactly the same, so it still should be
    # clean and at the same commit
    assert ds.is_installed()
    assert not ds.repo.dirty
    assert head == ds.repo.get_hexsha()
def test_zip_archive(path):
    ds = Dataset(opj(path, 'ds')).create(force=True, no_annex=True)
    ds.add('.')
    with chpwd(path):
        ds.export_archive(filename='my', archivetype='zip')
        assert_true(os.path.exists('my.zip'))
        custom1_md5 = md5sum('my.zip')
        time.sleep(1.1)
        ds.export_archive(filename='my', archivetype='zip')
        assert_equal(md5sum('my.zip'), custom1_md5)

    # should be able to export without us cd'ing to that ds directory
    ds.export_archive(filename=ds.path, archivetype='zip')
    default_name = 'datalad_{}.zip'.format(ds.id)
    assert_true(os.path.exists(os.path.join(ds.path, default_name)))
def test_docker(path):
    # Singularity's "docker://" scheme.
    ds = Dataset(path).create()
    ds.containers_add(
        "bb",
        url=("docker://busybox@sha256:"
             "7964ad52e396a6e045c39b5a44438424ac52e12e4d5a25d94895f2058cb863a0"))
    img = op.join(ds.path, ".datalad", "environments", "bb", "image")
    assert_result_count(ds.containers_list(), 1, path=img, name="bb")
    ok_clean_git(path)

    WitlessRunner(cwd=ds.path).run(
        ["datalad", "containers-run", "ls", "/singularity"],
        protocol=StdOutCapture)
def test_addurls_deeper(self, path):
    ds = Dataset(path).create(force=True)
    ds.addurls(
        self.json_file,
        "{url}",
        "{subdir}//adir/{subdir}-again//other-ds//bdir/{name}")
    eq_(set(ds.subdatasets(recursive=True, result_xfm="relpaths")),
        {"foo",
         "bar",
         op.join("foo", "adir", "foo-again"),
         op.join("bar", "adir", "bar-again"),
         op.join("foo", "adir", "foo-again", "other-ds"),
         op.join("bar", "adir", "bar-again", "other-ds")})
    ok_exists(os.path.join(
        ds.path, "foo", "adir", "foo-again", "other-ds", "bdir", "a"))
def test_addurls_from_key_invalid_format(self, path):
    ds = Dataset(path).create(force=True)
    for fmt in [
            "{name}-which-has-no-double-dash",
            # Invalid hash length.
            "MD5-s{size}--{md5sum}a",
            # Invalid hash content.
            "MD5-s{size}--" + 32 * "q"]:
        with assert_raises(IncompleteResultsError):
            ds.addurls(self.json_file, "{url}", "{name}",
                       key=fmt, exclude_autometa="*")
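# The rejected formats above violate git-annex's key layout
# 'BACKEND-sSIZE--HASH'. A quick sketch of why the two MD5 examples fail;
# the validation regex is illustrative, not git-annex's own check:
import re

def looks_like_md5_key(key):
    # an MD5 key needs exactly 32 lowercase hex digest characters
    return re.fullmatch(r"MD5-s[0-9]+--[0-9a-f]{32}", key) is not None

assert looks_like_md5_key("MD5-s5--" + 32 * "d")
assert not looks_like_md5_key("MD5-s5--" + 32 * "d" + "a")  # wrong length
assert not looks_like_md5_key("MD5-s5--" + 32 * "q")        # not hex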
def annex_path(tmpdir_factory):
    path = tmpdir_factory.mktemp('annexes')
    ds_path = str(path.join(DATASET_ID))
    # Create an empty dataset for testing
    ds = Dataset(ds_path)
    ds.create()
    ds.no_annex(BIDS_NO_ANNEX)
    json_path = os.path.join(ds_path, 'dataset_description.json')
    with open(json_path, 'w') as f:
        json.dump(DATASET_DESCRIPTION, f, ensure_ascii=False)
    ds.add(json_path)
    ds.save(version_tag=SNAPSHOT_ID)
    # Set up a seed for any new_dataset uses
    random.seed(42)
    return path
def setUp(self):
    pt = test_directory.parent / "BEP032-examples"
    if pt.exists():
        self.dataset = Dataset(str(pt))
        self.dataset.clean()
        self.dataset.update(merge=True)
        self.dataset.get()
    else:
        self.dataset = install(
            path=str(pt),
            source="https://gin.g-node.org/NeuralEnsemble/BEP032-examples",
            get_data=True,
        )
    self.datadir = str(pt)
    self.savedir = tempfile.mkdtemp()
def test_noop(path, outdir):
    ds = Dataset(opj(path, 'ds')).create(force=True)
    ds.save()
    assert_raises(
        TypeError,
        ds.bids2scidata,
    )
    with chpwd(outdir):  # to not pollute cwd
        assert_raises(
            IncompleteResultsError,
            ds.bids2scidata,
            repo_name="dummy",
            repo_accession='ds1',
            repo_url='http://example.com',
        )
def test_reproin_largely_smoke(tmpdir, heuristic, invocation):
    is_bids = True if heuristic == 'reproin' else False
    arg = "--random-seed 1 -f %s -c dcm2niix -o %s" % (heuristic, tmpdir)
    if is_bids:
        arg += " -b"
    arg += " --datalad "
    args = (arg + invocation).split(' ')

    # Test some safeguards
    if invocation == "--files %s" % TESTS_DATA_PATH:
        # Multiple subjects must not be specified -- only a single one
        # could be overridden from the command line
        with pytest.raises(ValueError):
            runner(args + ['--subjects', 'sub1', 'sub2'])

        if heuristic != 'reproin':
            # no other heuristic has the mighty infotoids atm
            with pytest.raises(NotImplementedError):
                runner(args)
            return

    runner(args)
    ds = Dataset(str(tmpdir))
    assert ds.is_installed()
    assert not ds.repo.dirty
    head = ds.repo.get_hexsha()

    # and if we rerun -- should fail
    lgr.info(
        "RERUNNING, expecting to FAIL since the same everything "
        "and -c specified so we did conversion already"
    )
    with pytest.raises(RuntimeError):
        runner(args)

    # but there should be nothing new
    assert not ds.repo.dirty
    assert head == ds.repo.get_hexsha()

    # unless we pass the 'overwrite' flag
    runner(args + ['--overwrite'])
    # but the result should be exactly the same, so it still should be
    # clean and at the same commit
    assert ds.is_installed()
    assert not ds.repo.dirty
    assert head == ds.repo.get_hexsha()
def test_ensure_datalad_remote_init_and_enable_needed(path=None):
    from datalad.consts import DATALAD_SPECIAL_REMOTE
    ds = Dataset(path).create(force=True)
    repo = ds.repo
    assert_false(repo.get_remotes())
    ensure_datalad_remote(repo)
    assert_in(DATALAD_SPECIAL_REMOTE, repo.get_remotes())
def test_get_metadata(path):
    ds = Dataset(path).create(force=True)
    p = MetadataExtractor(ds, [])
    meta = p._get_dataset_metadata()
    assert_equal(
        dumps(meta, sort_keys=True, indent=2),
        """\
{
  "author": "Jane Doe <*****@*****.**>",
  "conformsto": "http://specs.frictionlessdata.io/data-packages",
  "contributors": [
    "Joe Bloggs <*****@*****.**> (http://www.example.com)"
  ],
  "description": "Annual Consumer Price Index (CPI) for most countries in the world. Reference year is 2005.",
  "license": "http://opendatacommons.org/licenses/pddl/",
  "name": "cpi",
  "shortdescription": "Annual Consumer Price Index (CPI)",
  "tag": [
    "CPI",
    "World",
    "Consumer Price Index",
    "Annual Data",
    "The World Bank"
  ],
  "version": "2.0.0"
}""")
def test_profile_get_repo_files(annex_path, new_dataset):
    ds_id = os.path.basename(new_dataset.path)
    ds = Dataset(str(annex_path.join(ds_id)))
    for each in range(5000):
        filename = 'file-{}'.format(each)
        path = os.path.join(new_dataset.path, filename)
        with open(path, 'a'):
            os.utime(path)
    # Add all generated files
    ds.add('.')
    # Profile get_repo_files by itself
    with open('{}.prof'.format(__name__), 'w+b') as fd:
        vmprof.enable(fd.fileno())
        for n in range(1):
            get_repo_files(ds)
        vmprof.disable()
def test_dicom2spec(path):
    # ### SETUP ###
    dicoms = get_dicom_dataset('structural')
    ds = Dataset.create(path, cfg_proc=['hirni'])
    ds.install(source=dicoms, path='acq100')
    # Note: recursive, since aggregation wasn't performed in the installed
    # datasets
    # TODO: use get_raw_sd from above instead of this setup
    ds.meta_aggregate('acq100', into='top', recursive=True)
    # ### END SETUP ###

    # TODO: should it be specfile or acq/specfile? => at least doc needed,
    # if not a change
    res = ds.hirni_dicom2spec(path='acq100', spec='spec_structural.json')

    # check the actual location of spec_structural!
    # => study-ds root!
    assert_result_count(res, 2)
    assert_result_count(res, 1, path=op.join(ds.path, 'spec_structural.json'))
    assert_result_count(res, 1, path=op.join(ds.path, '.gitattributes'))
    ok_clean_git(ds.path)

    # multiple executions shouldn't change .gitattributes again:
    from os import stat
    mtime = stat(op.join(ds.path, '.gitattributes')).st_mtime
    res = ds.hirni_dicom2spec(path='acq100', spec='spec_structural.json')
    assert_equal(stat(op.join(ds.path, '.gitattributes')).st_mtime, mtime)
def test_get_metadata(path):
    ds = Dataset(path).create(force=True)
    meta = MetadataParser(ds, []).get_metadata(True, False)[0]
    del meta['@context']
    dump = dumps(meta, sort_keys=True, indent=2, ensure_ascii=False)
    assert_equal(
        dump,
        """\
{
  "author": [
    "Mike One",
    "Anna Two"
  ],
  "citation": [
    "http://studyforrest.org"
  ],
  "comment<BIDSVersion>": "1.0.0-rc3",
  "conformsto": "http://bids.neuroimaging.io/bids_spec1.0.0-rc3.pdf",
  "description": "Some description",
  "fundedby": "We got money from collecting plastic bottles",
  "license": "PDDL",
  "name": "studyforrest_phase2"
}""")

    test_fname = opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')
    cmeta = list(MetadataParser(
        ds,
        [opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')]
    ).get_metadata(False, True)[1])
    assert_equal(len(cmeta), 1)
    assert_equal(cmeta[0][0], test_fname)
    assert_in('comment<participant#handedness>', cmeta[0][1])
def get_raw_dataset(self):
    # Note: This is lazy to avoid building at import time, since the
    # import is part of nose's discovery and executed before the
    # dependencies. This leads to datalad's ui backend not yet being
    # correctly set, which in turn lets the cloning hang within
    # progressbar generation.
    if not self._dspath:
        import tempfile
        kwargs = get_tempfile_kwargs()
        path = tempfile.mkdtemp(**kwargs)
        f_dicoms = get_dicom_dataset('functional')
        s_dicoms = get_dicom_dataset('structural')
        ds = Dataset.create(path, cfg_proc=['hirni'])
        ds.install(source=f_dicoms, path=op.join('func_acq', 'dicoms'))
        ds.install(source=s_dicoms, path=op.join('struct_acq', 'dicoms'))
        # Note: recursive, since aggregation wasn't performed in the
        # installed datasets
        ds.meta_aggregate(
            [op.join('func_acq', 'dicoms'),
             op.join('struct_acq', 'dicoms')],
            into='top',
            recursive=True)
        # TODO: figure out how to add it to the things to be removed
        # after the tests ran
        self._dspath = ds.path
    return self._dspath
def test_initremote(store_path, store_url, ds_path):
    ds = Dataset(ds_path).create()
    store_path = Path(store_path)
    url = "ria+" + store_url
    init_opts = common_init_opts + ['url={}'.format(url)]

    # fails on a non-RIA URL
    assert_raises(CommandError, ds.repo.init_remote, 'ora-remote',
                  options=common_init_opts + [
                      'url={}'.format(store_path.as_uri())])
    # doesn't actually create a remote if it fails
    assert_not_in(
        'ora-remote',
        [cfg['name']
         for uuid, cfg in ds.repo.get_special_remotes().items()])

    ds.repo.init_remote('ora-remote', options=init_opts)
    assert_in(
        'ora-remote',
        [cfg['name']
         for uuid, cfg in ds.repo.get_special_remotes().items()])
    assert_repo_status(ds.path)
    # git-annex:remote.log should have:
    # - url
    # - common_init_opts
    # - archive_id (which equals ds id)
    remote_log = ds.repo.call_git(
        ['cat-file', 'blob', 'git-annex:remote.log'])
    assert_in("url={}".format(url), remote_log)
    [assert_in(c, remote_log) for c in common_init_opts]
    assert_in("archive-id={}".format(ds.id), remote_log)
def test_exif(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'exif', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'),
        path)
    ds.save()
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('exif.jpg')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['exif']
    for k, v in target.items():
        eq_(meta[k], v)

    assert_in('@context', meta)
def test_zip_archive(path):
    ds = Dataset(opj(path, 'ds')).create(force=True, no_annex=True)
    ds.save()
    with chpwd(path):
        ds.export_archive(filename='my', archivetype='zip')
        assert_true(os.path.exists('my.zip'))
        custom1_md5 = md5sum('my.zip')
        time.sleep(1.1)
        ds.export_archive(filename='my', archivetype='zip')
        assert_equal(md5sum('my.zip'), custom1_md5)

    # should be able to export without us cd'ing to that ds directory
    ds.export_archive(filename=ds.path, archivetype='zip')
    default_name = 'datalad_{}.zip'.format(ds.id)
    assert_true(os.path.exists(os.path.join(ds.path, default_name)))
def test_audio(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['audio']
    for k, v in target.items():
        eq_(meta[k], v)

    assert_in('@context', meta)

    uniques = ds.metadata(
        reporton='datasets',
        return_type='item-or-list'
    )['metadata']['datalad_unique_content_properties']
    # the test file has it, but the uniques have it blanked out, because
    # the extractor considers it worthless for discovering whole datasets
    assert_in('bitrate', meta)
    eq_(uniques['audio']['bitrate'], None)

    # the 'date' field carries no value, hence it gets excluded from the
    # unique report
    assert_in('date', meta)
    assert not meta['date']
    assert_not_in('date', uniques['audio'])
def test_basic_metadata(path):
    ds = Dataset(opj(path, 'origin'))
    meta = get_metadata(ds)
    assert_equal(sorted(meta[0].keys()),
                 ['@context', 'dcterms:conformsTo'])
    ds.create(force=True, save=False)
    # with subdataset
    sub = ds.create('sub', force=True)
    ds.save()
    meta = get_metadata(ds)
    assert_equal(
        sorted(meta[0].keys()),
        ['@context', '@id', 'availableFrom', 'dcterms:conformsTo',
         'dcterms:modified', 'type', 'version'])
    assert_equal(meta[0]['type'], 'Dataset')
    # clone and get relationship info in metadata
    sibling = install(opj(path, 'sibling'), source=opj(path, 'origin'))
    sibling_meta = get_metadata(sibling)
    assert_equal(sibling_meta[0]['@id'], ds.id)
    # origin should learn about the clone
    sibling.repo.push(remote='origin', refspec='git-annex')
    meta = get_metadata(ds)
    assert_equal([m['@id'] for m in meta[0]['availableFrom']],
                 [m['@id'] for m in sibling_meta[0]['availableFrom']])
    meta = get_metadata(ds, guess_type=True)
    # without aggregation there is no trace of subdatasets in the metadata
    assert_not_in('dcterms:hasPart', meta[0])
def test_addurls_version(self, path):
    ds = Dataset(path).create(force=True)

    def version_fn(url):
        if url.endswith("b.dat"):
            raise ValueError("Scheme error")
        return url + ".v1"

    with patch("datalad.plugin.addurls.get_versioned_url", version_fn):
        with swallow_logs(new_level=logging.WARNING) as cml:
            ds.addurls(self.json_file, "{url}", "{name}",
                       version_urls=True)
            assert_in("b.dat", str(cml.out))

    names = ["a", "c"]
    for fname in names:
        ok_exists(os.path.join(path, fname))

    whereis = ds.repo.whereis(names, output="full")
    for fname, info in whereis.items():
        eq_(info[ds.repo.WEB_UUID]['urls'],
            ["{}udir/{}.dat.v1".format(self.url, fname)])
def check_integration1(login, keyring, path,
                       organization=None,
                       kwargs={},
                       oauthtokens=None):
    kwargs = kwargs.copy()
    if organization:
        kwargs['github_organization'] = organization

    ds = Dataset(path).create()
    if oauthtokens:
        for oauthtoken in assure_list(oauthtokens):
            ds.config.add('hub.oauthtoken', oauthtoken, where='local')

    # so we do not pick up local repo configuration/token
    repo_name = 'test_integration1'
    with chpwd(path):
        # ATM all the github goodness does not care about "this dataset",
        # so force the "process wide" cfg to pick up the oauthtoken we
        # defined above
        cfg.reload(force=True)
        # everything works just nice, no conflicts etc
        res = ds.create_sibling_github(repo_name, **kwargs)

        if organization:
            url_fmt = 'https://{login}@github.com/{organization}/{repo_name}.git'
        else:
            url_fmt = 'https://github.com/{login}/{repo_name}.git'
        eq_(res, [(ds, url_fmt.format(**locals()), False)])

        # but if we rerun - should kaboom since it already has this sibling:
        with assert_raises(ValueError) as cme:
            ds.create_sibling_github(repo_name, **kwargs)
        assert_in("already has a configured sibling", str(cme.exception))

        # but we can give it a new name; it should still kaboom since the
        # remote one exists already
        with assert_raises(ValueError) as cme:
            ds.create_sibling_github(repo_name, name="github2", **kwargs)
        assert_in("already exists on", str(cme.exception))
        # we should not leave the broken sibling behind
        assert_not_in('github2', ds.repo.get_remotes())

        # if we ask to reconfigure - should proceed normally
        ds.create_sibling_github(repo_name, existing='reconfigure', **kwargs)
    cfg.reload(force=True)
def test_tarball(path):
    ds = Dataset(opj(path, 'ds')).create(force=True)
    ds.save(all_changes=True)
    committed_date = ds.repo.get_committed_date()
    with chpwd(path):
        _mod, tarball1 = ds.export('tarball')
        assert not isabs(tarball1)
        tarball1 = opj(path, tarball1)
    default_outname = opj(path, 'datalad_{}.tar.gz'.format(ds.id))
    assert_equal(tarball1, default_outname)
    assert_true(os.path.exists(default_outname))
    custom_outname = opj(path, 'myexport.tar.gz')
    # feed in without extension
    ds.export('tarball', output=custom_outname[:-7])
    assert_true(os.path.exists(custom_outname))
    custom1_md5 = md5sum(custom_outname)
    # encodes the original tarball filename -> different checksum, despite
    # same content
    assert_not_equal(md5sum(default_outname), custom1_md5)
    # should really sleep, so if they stop using time.time - we know
    time.sleep(1.1)
    ds.export('tarball', output=custom_outname)
    # should not encode mtime, so should be identical
    assert_equal(md5sum(custom_outname), custom1_md5)

    def check_contents(outname, prefix):
        with tarfile.open(outname) as tf:
            nfiles = 0
            for ti in tf:
                # any annex links resolved
                assert_false(ti.issym())
                ok_startswith(ti.name, prefix + '/')
                assert_equal(ti.mtime, committed_date)
                if '.datalad' not in ti.name:
                    # ignore any files in .datalad for this test to not be
                    # susceptible to changes in how much meta info we
                    # generate
                    nfiles += 1
            # we have exactly three files, and expect no content for any
            # directory
            assert_equal(nfiles, 3)

    check_contents(default_outname, 'datalad_%s' % ds.id)
    check_contents(custom_outname, 'myexport')
def test_addurls_subdataset(self, path):
    ds = Dataset(path).create(force=True)
    with chpwd(path):
        for save in True, False:
            label = "save" if save else "nosave"
            ds.addurls(self.json_file, "{url}",
                       "{subdir}-" + label + "//{name}",
                       save=save)

            subdirs = ["{}-{}".format(d, label) for d in ["foo", "bar"]]
            subdir_files = dict(zip(subdirs, [["a", "c"], ["b"]]))

            for subds, fnames in subdir_files.items():
                for fname in fnames:
                    ok_exists(op.join(subds, fname))

            if save:
                assert_repo_status(path)
            else:
                # The datasets are created and saved ...
                assert_repo_status(path, modified=subdirs)
                # ... but the downloaded files aren't.
                for subds, fnames in subdir_files.items():
                    assert_repo_status(subds, added=fnames)

        # Now save the "--nosave" changes and check that we have
        # all the subdatasets.
        ds.save()
        eq_(set(subdatasets(dataset=ds, recursive=True,
                            result_xfm="relpaths")),
            {"foo-save", "bar-save", "foo-nosave", "bar-nosave"})

        # We don't try to recreate existing subdatasets.
        with swallow_logs(new_level=logging.DEBUG) as cml:
            ds.addurls(self.json_file, "{url}", "{subdir}-nosave//{name}")
            assert_in("Not creating subdataset at existing path", cml.out)
def test_addurls_url_filename(self, path):
    ds = Dataset(path).create(force=True)
    with chpwd(path):
        ds.addurls(self.json_file, "{url}", "{_url0}/{_url_filename}")
        for fname in ["udir/a.dat", "udir/b.dat", "udir/c.dat"]:
            ok_exists(fname)
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing, and only a log
    # warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
    assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype',
                     'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype',
                        'frictionless_datapackage',
                        where='dataset')
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # aggregate metadata from all subdatasets into any superdataset,
    # including intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get a success report for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    ok_clean_git(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports the layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # now that we have annex.key
    # three different IDs
    assert_equal(3, len(set([s['dsid'] for s in origres
                             if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] ==
                 assure_unicode(name)
                 for s in origres if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone; should make no difference
    assert_status('ok', clone.install('sub', result_xfm=None,
                                      return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    # query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])
def test_addurls_nonannex_repo(path):
    ds = Dataset(path).create(force=True, no_annex=True)
    with assert_raises(IncompleteResultsError) as raised:
        ds.addurls("dummy_arg0", "dummy_arg1", "dummy_arg2")
    assert_in("not an annex repo", str(raised.exception))
def test_get_aggregates_fails(path):
    with chpwd(path), assert_raises(NoDatasetArgumentFound):
        metadata(get_aggregates=True)
    ds = Dataset(path).create()
    res = ds.metadata(get_aggregates=True, on_failure='ignore')
    assert_result_count(res, 1, path=ds.path, status='impossible')
class supers(SuprocBenchmarks):
    """
    Benchmarks on common operations on collections of datasets using datalad API
    """

    timeout = 3600
    # need to assure that we are working in a different repository now
    # see https://github.com/datalad/datalad/issues/1512
    # might not be sufficient due to side effects between tests and
    # thus getting into the same situation
    ds_count = 0

    def setup_cache(self):
        # creating in CWD so things get removed when ASV is done
        ds_path = create_test_dataset("testds1", spec='2/-2/-2', seed=0)[0]
        # Will store into a tarfile since otherwise install -r is way too
        # slow to be invoked for every benchmark
        tarfile_path = opj(osp.dirname(ds_path), 'testds1.tar')
        with tarfile.open(tarfile_path, "w") as tar:
            # F.CK -- Python tarfile can't later extract those because
            # key dirs are read-only. For now just a workaround - make it
            # all writeable
            from datalad.utils import rotree
            rotree('testds1', ro=False, chmod_files=False)
            tar.add('testds1', recursive=True)
        rmtree('testds1')
        return tarfile_path

    def setup(self, tarfile_path):
        import tarfile
        tempdir = osp.dirname(tarfile_path)
        with tarfile.open(tarfile_path) as tar:
            tar.extractall(tempdir)

        # TODO -- remove this abomination after
        # https://github.com/datalad/datalad/issues/1512 is fixed
        epath = opj(tempdir, 'testds1')
        epath_unique = epath + str(self.__class__.ds_count)
        os.rename(epath, epath_unique)
        self.__class__.ds_count += 1
        self.ds = Dataset(epath_unique)
        print("Finished setup for %s" % tempdir)

    def teardown(self, tarfile_path):
        for path in [self.ds.path + '_', self.ds.path]:
            print("Cleaning up %s" % path)
            if osp.exists(path):
                rmtree(path)

    def time_installr(self, tarfile_path):
        # somewhat duplicating setup, but too lazy to do a different one
        # for now
        assert install(self.ds.path + '_', source=self.ds.path,
                       recursive=True)

    def time_createadd(self, tarfile_path):
        assert self.ds.create('newsubds')

    def time_createadd_to_dataset(self, tarfile_path):
        subds = create(opj(self.ds.path, 'newsubds'))
        self.ds.add(subds.path)

    def time_ls(self, tarfile_path):
        ls(self.ds.path)

    def time_ls_recursive(self, tarfile_path):
        ls(self.ds.path, recursive=True)

    def time_ls_recursive_long_all(self, tarfile_path):
        ls(self.ds.path, recursive=True, long_=True, all_=True)

    # TODO: since it doesn't really allow uninstalling the top level ds... bleh ;)
    #def time_uninstall(self, tarfile_path):
    #    uninstall(self.ds.path, recursive=True)

    def time_remove(self, tarfile_path):
        remove(self.ds.path, recursive=True)
def test_addurls(self, path):
    ds = Dataset(path).create(force=True)

    def get_annex_commit_counts():
        return int(
            ds.repo.repo.git.rev_list("--count", "git-annex").strip())

    n_annex_commits = get_annex_commit_counts()

    with chpwd(path):
        ds.addurls(self.json_file, "{url}", "{name}")

        filenames = ["a", "b", "c"]
        for fname in filenames:
            ok_exists(fname)

        for (fname, meta), subdir in zip(ds.repo.get_metadata(filenames),
                                         ["foo", "bar", "foo"]):
            assert_dict_equal(meta, {"subdir": [subdir], "name": [fname]})

        # Ignore this check if we're faking dates because that disables
        # batch mode.
        if not os.environ.get('DATALAD_FAKE__DATES'):
            # We should have two new commits on the git-annex branch: one
            # for the added urls and one for the added metadata.
            eq_(n_annex_commits + 2, get_annex_commit_counts())

        # Add to already existing links, overwriting.
        with swallow_logs(new_level=logging.DEBUG) as cml:
            ds.addurls(self.json_file, "{url}", "{name}",
                       ifexists="overwrite")
            for fname in filenames:
                assert_in("Removing {}".format(os.path.join(path, fname)),
                          cml.out)

        # Add to already existing links, skipping.
        assert_in_results(
            ds.addurls(self.json_file, "{url}", "{name}", ifexists="skip"),
            action="addurls",
            status="notneeded")

        # Adding to already existing links works, as long as the content
        # is the same.
        ds.addurls(self.json_file, "{url}", "{name}")

        # But it fails if something has changed.
        ds.unlock("a")
        with open("a", "w") as ofh:
            ofh.write("changed")
        ds.save("a")
        assert_raises(IncompleteResultsError,
                      ds.addurls,
                      self.json_file, "{url}", "{name}")
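# The commit counting above goes through GitPython
# (`ds.repo.repo.git.rev_list`). An equivalent standalone helper using only
# the subprocess module, as a sketch of what is being measured:
import subprocess

def annex_commit_count(repo_path):
    # count commits on the git-annex branch; returns 0 if the branch
    # does not exist yet
    try:
        out = subprocess.run(
            ["git", "rev-list", "--count", "git-annex"],
            cwd=repo_path, capture_output=True, text=True, check=True)
        return int(out.stdout.strip())
    except subprocess.CalledProcessError:
        return 0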
def test_addurls_dropped_urls(self, path):
    ds = Dataset(path).create(force=True)
    with chpwd(path), swallow_logs(new_level=logging.WARNING) as cml:
        ds.addurls(self.json_file, "", "{subdir}//{name}")
        assert_re_in(r".*Dropped [0-9]+ row\(s\) that had an empty URL",
                     str(cml.out))
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override the default and search datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.save()
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)
    # If it is not under annex, the addition of metadata below silently
    # does not do anything
    ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'})
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')
        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check the generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
            ('egrep',
             ':mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above; the leading ':' is stripped, which indicates
            # "ALL FIELDS"
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, but with an AND condition
            # get both matches
            ('egrep',
             ['mp3', 'type:file'],
             opj('stim', 'stim1.mp3'),
             {'type': 'file', 'audio.format': 'mp3'}),
            # case-insensitive search
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # field selection by expression
            ('egrep',
             'audio\.+:mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'meta': 'mp3'}),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # XXX next one is not supported by the current text field
            # analyzer; it decomposes the mime type into [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from;
            # critical for discovering related content
            dsid=ds.id)
        # in egrep mode we currently do not search unique values, and the
        # queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key],
                         matched_val)

    # test that a suggestion is logged if there are no hits and the key
    # is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    subds = ds.create('sub', force=True)
    subsubds = subds.create('subsub', force=True)
    # aggregate from bottom to top, guess native data, no compacting of graph
    # should yield 6 metadata sets: one implicit and one native per dataset,
    # and a second native set for the topmost dataset
    aggregate_metadata(ds, guess_native_type=True, recursive=True)
    # now only ask the top superdataset, no recursion, just reading from
    # the cache
    meta = get_metadata(
        ds, guess_type=False, ignore_subdatasets=False, ignore_cache=False)
    assert_equal(len(meta), 10)
    # same schema
    assert_equal(
        10,
        sum([s.get('@context', {'@vocab': None})['@vocab'] ==
             'http://schema.org/'
             for s in meta]))
    # three different IDs
    assert_equal(3, len(set([s.get('@id') for s in meta])))
    # and we know about all three datasets
    for name in ('mother_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(sum([s.get('name', None) == assure_unicode(name)
                         for s in meta]))
    #print(meta)
    assert_equal(
        # first implicit, then two natives, then aggregate
        meta[3]['dcterms:hasPart']['@id'],
        subds.id)
    success = False
    for m in meta:
        p = m.get('dcterms:hasPart', {})
        if p.get('@id', None) == subsubds.id:
            assert_equal(opj('sub', 'subsub'), p.get('location', None))
            success = True
    assert_true(success)

    # save the toplevel dataset only (see below)
    ds.save('with aggregated meta data', all_changes=True)

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(opj(path, 'clone'), source=ds.path)
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata; the implicit one for the top-most dataset should
    # differ, but the rest should not
    clonemeta = get_metadata(
        clone, guess_type=False, ignore_subdatasets=False,
        ignore_cache=False)

    # make sure the implicit md for the topmost comes first
    assert_equal(clonemeta[0]['@id'], clone.id)
    assert_equal(clonemeta[0]['@id'], ds.id)
    assert_equal(clone.repo.get_hexsha(), ds.repo.get_hexsha())
    assert_equal(clonemeta[0]['version'], ds.repo.get_hexsha())
    # all but the implicit one are identical
    assert_equal(clonemeta[1:], meta[1:])
    # the implicit md of the clone should list a dataset ID for its subds,
    # although it has not been obtained!
    assert_equal(
        clonemeta[3]['dcterms:hasPart']['@id'],
        subds.id)

    # now obtain a subdataset in the clone, and the IDs should be updated
    clone.install('sub')
    partial = get_metadata(clone, guess_type=False, ignore_cache=True)
    # ids don't change
    assert_equal(partial[0]['@id'], clonemeta[0]['@id'])
    # datasets are properly connected
    assert_equal(partial[1]['dcterms:hasPart']['@id'],
                 partial[2]['@id'])

    # query smoke test
    if os.environ.get('DATALAD_TESTS_NONETWORK'):
        raise SkipTest

    assert_equal(len(list(clone.search('mother'))), 1)
    assert_equal(len(list(clone.search('MoTHER'))), 1)  # case insensitive

    child_res = list(clone.search('child'))
    assert_equal(len(child_res), 2)

    # little helper to match names
    def assert_names(res, names, path=clone.path):
        assert_equal(list(map(itemgetter(0), res)),
                     [opj(path, n) for n in names])
    # should yield (location, report) tuples
    assert_names(child_res, ['sub', 'sub/subsub'])

    # the result should be identical to invoking search from the api,
    # and search_ should spit out locations
    with swallow_outputs() as cmo:
        res = list(search_('child', dataset=clone))
        assert_equal(res, child_res)
        assert_in(res[0][0], cmo.out)
    # and overarching search_ just for smoke testing of processing outputs
    # and not puking (e.g. under PY3)
    with swallow_outputs() as cmo:
        assert list(search_('.', regex=True, dataset=clone))
        assert cmo.out

    # test searching among specified properties only
    assert_names(clone.search('i', search='name'), ['sub', 'sub/subsub'])
    assert_names(clone.search('i', search='keywords'), ['.'])
    # case shouldn't matter
    assert_names(clone.search('i', search='Keywords'), ['.'])
    assert_names(clone.search('i', search=['name', 'keywords']),
                 ['.', 'sub', 'sub/subsub'])

    # without report_matched, we are getting none of the fields
    assert all([not x for x in map(itemgetter(1), child_res)])
    # but we would get all if asking for '*'
    assert all([len(x) >= 9
                for x in map(itemgetter(1),
                             list(clone.search('child', report='*')))])
    # but we would get only the matching name if we ask for report_matched
    assert_equal(
        set(map(lambda x: tuple(x[1].keys()),
                clone.search('child', report_matched=True))),
        set([('name',)]))
    # and the additional field we might have asked for with report
    assert_equal(
        set(map(lambda x: tuple(sorted(x[1].keys())),
                clone.search('child', report_matched=True,
                             report=['schema:type']))),
        set([('name', 'schema:type')]))
    # and if we ask for report to be 'empty', we should get no fields
    child_res_empty = list(clone.search('child', report=''))
    assert_equal(len(child_res_empty), 2)
    assert_equal(
        set(map(lambda x: tuple(x[1].keys()), child_res_empty)),
        set([tuple()]))

    # more tests on returned paths:
    assert_names(clone.search('datalad'), ['.', 'sub', 'sub/subsub'])
    # if we clone a subdataset and query for a value present in it and its kid
    clone_sub = clone.install('sub')
    assert_names(clone_sub.search('datalad'), ['.', 'subsub'],
                 clone_sub.path)

    # Test 'and' for multiple search entries
    assert_equal(len(list(clone.search(['child', 'bids']))), 2)
    assert_equal(len(list(clone.search(['child', 'subsub']))), 1)
    assert_equal(len(list(clone.search(['bids', 'sub']))), 2)

    res = list(clone.search('.*', regex=True))  # with regex
    assert_equal(len(res), 3)  # one per dataset

    # we do search, not match
    assert_equal(len(list(clone.search('randchild', regex=True))), 1)
    assert_equal(len(list(clone.search(['gr.nd', 'ch.ld'], regex=True))), 1)
    assert_equal(len(list(clone.search('randchil.', regex=True))), 1)
    assert_equal(len(list(clone.search('^randchild.*', regex=True))), 0)
    assert_equal(len(list(clone.search('^grandchild.*', regex=True))), 1)
    assert_equal(len(list(clone.search('grandchild'))), 1)
def test_archive(path):
    ds = Dataset(opj(path, 'ds')).create(force=True)
    ds.save()
    committed_date = ds.repo.get_commit_date()
    default_outname = opj(path, 'datalad_{}.tar.gz'.format(ds.id))
    with chpwd(path):
        res = list(ds.export_archive())
        assert_status('ok', res)
        assert_result_count(res, 1)
        assert isabs(res[0]['path'])
    assert_true(os.path.exists(default_outname))
    custom_outname = opj(path, 'myexport.tar.gz')
    # feed in without extension
    ds.export_archive(filename=custom_outname[:-7])
    assert_true(os.path.exists(custom_outname))
    custom1_md5 = md5sum(custom_outname)
    # encodes the original archive filename -> different checksum, despite
    # same content
    assert_not_equal(md5sum(default_outname), custom1_md5)
    # should really sleep, so if they stop using time.time - we know
    time.sleep(1.1)
    ds.export_archive(filename=custom_outname)
    # should not encode mtime, so should be identical
    assert_equal(md5sum(custom_outname), custom1_md5)

    def check_contents(outname, prefix):
        with tarfile.open(outname) as tf:
            nfiles = 0
            for ti in tf:
                # any annex links resolved
                assert_false(ti.issym())
                ok_startswith(ti.name, prefix + '/')
                assert_equal(ti.mtime, committed_date)
                if '.datalad' not in ti.name:
                    # ignore any files in .datalad for this test to not be
                    # susceptible to changes in how much meta info we
                    # generate
                    nfiles += 1
            # we have exactly four files (includes .gitattributes for the
            # default MD5E backend), and expect no content for any
            # directory
            assert_equal(nfiles, 4)

    check_contents(default_outname, 'datalad_%s' % ds.id)
    check_contents(custom_outname, 'myexport')

    # now lose some content
    ds.drop('file_up', check=False)
    assert_raises(IOError, ds.export_archive, filename=opj(path, 'my'))
    ds.export_archive(filename=opj(path, 'partial'),
                      missing_content='ignore')
    assert_true(os.path.exists(opj(path, 'partial.tar.gz')))