def test_addurls_dry_run(path):
    ds = Dataset(path).create(force=True)

    with chpwd(path):
        json_file = "links.json"
        with open(json_file, "w") as jfh:
            json.dump([{"url": "URL/a.dat", "name": "a", "subdir": "foo"},
                       {"url": "URL/b.dat", "name": "b", "subdir": "bar"},
                       {"url": "URL/c.dat", "name": "c", "subdir": "foo"}],
                      jfh)

        ds.add(".", message="setup")

        with swallow_logs(new_level=logging.INFO) as cml:
            ds.addurls(json_file,
                       "{url}",
                       "{subdir}//{_url_filename_root}",
                       dry_run=True)

            for dir_ in ["foo", "bar"]:
                assert_in("Would create a subdataset at {}".format(dir_),
                          cml.out)
            assert_in(
                "Would download URL/a.dat to {}".format(
                    os.path.join(path, "foo", "BASE")),
                cml.out)

            assert_in("Metadata: {}".format([u"name=a", u"subdir=foo"]),
                      cml.out)
def test_addurls_subdataset(self, path):
    ds = Dataset(path).create(force=True)

    with chpwd(path):
        for save in True, False:
            label = "save" if save else "nosave"
            hexsha_before = ds.repo.get_hexsha()
            ds.addurls(self.json_file, "{url}",
                       "{subdir}-" + label + "//{name}",
                       save=save)
            hexsha_after = ds.repo.get_hexsha()

            for fname in ["foo-{}/a", "bar-{}/b", "foo-{}/c"]:
                ok_exists(fname.format(label))

            assert_true(save ^ (hexsha_before == hexsha_after))
            assert_true(save ^ ds.repo.dirty)

        # Now save the "--nosave" changes and check that we have
        # all the subdatasets.
        ds.add(".")
        eq_(set(subdatasets(ds, recursive=True, result_xfm="relpaths")),
            {"foo-save", "bar-save", "foo-nosave", "bar-nosave"})

        # We don't try to recreate existing subdatasets.
        with swallow_logs(new_level=logging.DEBUG) as cml:
            ds.addurls(self.json_file, "{url}", "{subdir}-nosave//{name}")
            assert_in("Not creating subdataset at existing path", cml.out)
def test_ignore_nondatasets(path):
    # we want to ignore the version/commits for this test
    def _kill_time(meta):
        for m in meta:
            for k in ('version', 'shasum'):
                if k in m:
                    del m[k]
        return meta

    ds = Dataset(path).create()
    meta = _kill_time(ds.metadata(reporton='datasets', on_failure='ignore'))
    n_subm = 0
    # placing another repo in the dataset has no effect on metadata
    for cls, subpath in ((GitRepo, 'subm'), (AnnexRepo, 'annex_subm')):
        subm_path = opj(ds.path, subpath)
        r = cls(subm_path, create=True)
        with open(opj(subm_path, 'test'), 'w') as f:
            f.write('test')
        r.add('test')
        r.commit('some')
        assert_true(Dataset(subm_path).is_installed())
        assert_equal(meta,
                     _kill_time(ds.metadata(reporton='datasets',
                                            on_failure='ignore')))
        # making it a submodule has no effect either
        ds.add(subpath)
        assert_equal(len(ds.subdatasets()), n_subm + 1)
        assert_equal(meta,
                     _kill_time(ds.metadata(reporton='datasets',
                                            on_failure='ignore')))
        n_subm += 1
def test_audio(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'audio.mp3'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('audio.mp3')
    assert_result_count(res, 1)

    # from this extractor
    meta = res[0]['metadata']['audio']
    for k, v in target.items():
        eq_(meta[k], v)

    assert_in('@context', meta)

    uniques = ds.metadata(
        reporton='datasets',
        return_type='item-or-list')['metadata']['datalad_unique_content_properties']
    # the test file has it, but uniques have it blanked out, because the
    # extractor considers it worthless for discovering whole datasets
    assert_in('bitrate', meta)
    eq_(uniques['audio']['bitrate'], None)

    # the 'date' field carries no value, hence gets excluded from the unique report
    assert_in('date', meta)
    assert(not meta['date'])
    assert_not_in('date', uniques['audio'])
def test_addurls(self, path):
    ds = Dataset(path).create(force=True)

    def get_annex_commit_counts():
        return int(
            ds.repo.repo.git.rev_list("--count", "git-annex").strip())

    n_annex_commits = get_annex_commit_counts()

    with chpwd(path):
        ds.addurls(self.json_file, "{url}", "{name}")

        filenames = ["a", "b", "c"]
        for fname in filenames:
            ok_exists(fname)

        for (fname, meta), subdir in zip(ds.repo.get_metadata(filenames),
                                         ["foo", "bar", "foo"]):
            assert_dict_equal(meta,
                              {"subdir": [subdir], "name": [fname]})

        # Ignore this check if we're faking dates because that disables
        # batch mode.
        if not os.environ.get('DATALAD_FAKE__DATES'):
            # We should have two new commits on the git-annex branch: one for
            # the added urls and one for the added metadata.
            eq_(n_annex_commits + 2, get_annex_commit_counts())

        # Add to already existing links, overwriting.
        with swallow_logs(new_level=logging.DEBUG) as cml:
            ds.addurls(self.json_file, "{url}", "{name}",
                       ifexists="overwrite")
            for fname in filenames:
                assert_in("Removing {}".format(os.path.join(path, fname)),
                          cml.out)

        # Add to already existing links, skipping.
        assert_in_results(
            ds.addurls(self.json_file, "{url}", "{name}", ifexists="skip"),
            action="addurls",
            status="notneeded")

        # Adding to already existing links works, as long as the content is
        # the same.
        ds.addurls(self.json_file, "{url}", "{name}")

        # But it fails if something has changed.
        ds.unlock("a")
        with open("a", "w") as ofh:
            ofh.write("changed")
        ds.add("a")

        assert_raises(IncompleteResultsError,
                      ds.addurls,
                      self.json_file, "{url}", "{name}")
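# A minimal sketch (not part of the original test suite) of the records file
# that the addurls tests above assume in self.json_file: a JSON list of dicts
# with "url", "name", and "subdir" keys, mirroring the dry-run test.
import json

example_records = [{"url": "URL/a.dat", "name": "a", "subdir": "foo"},
                   {"url": "URL/b.dat", "name": "b", "subdir": "bar"},
                   {"url": "URL/c.dat", "name": "c", "subdir": "foo"}]
with open("links.json", "w") as jfh:
    json.dump(example_records, jfh)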
def test_bf2458(src, dst):
    ds = Dataset(src).create(force=True)
    ds.add('.', to_git=False)

    # now clone (empty) into new dst
    clone = install(source=ds.path, path=dst)
    # XXX whereis says nothing in direct mode
    # content is not here
    eq_(clone.repo.whereis('dummy'), [ds.config.get('annex.uuid')])

    # check that plain metadata access does not `get` stuff
    clone.metadata('.', on_failure='ignore')
    # XXX whereis says nothing in direct mode
    eq_(clone.repo.whereis('dummy'), [ds.config.get('annex.uuid')])
def check_api(no_annex, path):
    ds = Dataset(path).create(force=True, no_annex=no_annex)
    ds.add('.')
    ok_clean_git(ds.path)

    processed_extractors, skipped_extractors = [], []
    for extractor_ep in iter_entry_points('datalad.metadata.extractors'):
        # we need to be able to query for metadata, even if there is none
        # from any extractor
        try:
            extractor_cls = extractor_ep.load()
        except Exception as exc:
            exc_ = str(exc)
            skipped_extractors += [exc_]
            continue
        extractor = extractor_cls(
            ds, paths=['file.dat'])
        meta = extractor.get_metadata(
            dataset=True,
            content=True)
        # we also get something for the dataset and something for the content,
        # even if either of the two is empty
        assert_equal(len(meta), 2)
        dsmeta, contentmeta = meta

        assert (isinstance(dsmeta, dict))
        assert hasattr(contentmeta, '__len__') or isgenerator(contentmeta)
        # verify that the generator does not blow up and has an entry for our
        # precious file
        cm = dict(contentmeta)
        # datalad_core does provide some (not really) information about our
        # precious file
        if extractor_ep.name == 'datalad_core':
            assert 'file.dat' in cm
        elif extractor_ep.name == 'annex':
            if not no_annex:
                # verify correct key, which is the same for all files of 0 size
                assert_equal(
                    cm['file.dat']['key'],
                    'MD5E-s0--d41d8cd98f00b204e9800998ecf8427e.dat'
                )
            else:
                # no metadata on that file
                assert not cm
        processed_extractors.append(extractor_ep.name)
    assert "datalad_core" in processed_extractors, \
        "Should have managed to find at least the core extractor"

    if skipped_extractors:
        raise SkipTest(
            "Not fully tested/succeeded since some extractors failed"
            " to load:\n%s" % ("\n".join(skipped_extractors)))
def test_exif(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'exif', where='dataset')
    copy(
        opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('exif.jpg')
    assert_result_count(res, 1)

    # from this extractor
    meta = res[0]['metadata']['exif']
    for k, v in target.items():
        eq_(meta[k], v)

    assert_in('@context', meta)
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    # before anything is aggregated we would get nothing and only a log warning
    with swallow_logs(new_level=logging.WARNING) as cml:
        assert_equal(list(query_aggregated_metadata('all', ds, [])), [])
        assert_re_in('.*Found no aggregated metadata.*update', cml.out)
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                     where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                        where='dataset')
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True, update_mode='all')
    # we get success reports for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    ok_clean_git(ds.path)

    # quick test of aggregate report
    aggs = ds.metadata(get_aggregates=True)
    # one for each dataset
    assert_result_count(aggs, 3)
    # mother also reports the layout version
    assert_result_count(aggs, 1, path=ds.path, layout_version=1)

    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 6)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 3, type='file')  # Now that we have annex.key
    # three different IDs
    assert_equal(3, len(set([s['dsid'] for s in origres
                             if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('MOTHER_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([s['metadata']['frictionless_datapackage']['name'] \
                 == assure_unicode(name)
                 for s in origres
                 if s['type'] == 'dataset']))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(
        opj(path, 'clone'), source=ds.path,
        result_xfm='datasets', return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 2)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 1, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # test search in search tests, not all over the place
    ## query smoke test
    assert_result_count(clone.search('mother', mode='egrep'), 1)
    assert_result_count(clone.search('(?i)MoTHER', mode='egrep'), 1)

    child_res = clone.search('child', mode='egrep')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['type'] == 'dataset':
            assert_in(
                r['query_matched']['frictionless_datapackage.name'],
                r['metadata']['frictionless_datapackage']['name'])
class DataladOrchestrator(Orchestrator, metaclass=abc.ABCMeta):
    """Execute command assuming (at least) a local dataset.
    """

    def __init__(self, resource, submission_type, job_spec=None,
                 resurrection=False):
        if not external_versions["datalad"]:
            raise MissingExternalDependency(
                "DataLad is required for orchestrator '{}'".format(self.name))

        super(DataladOrchestrator, self).__init__(
            resource, submission_type, job_spec, resurrection=resurrection)

        from datalad.api import Dataset
        self.ds = Dataset(".")
        if not self.ds.id:
            raise OrchestratorError(
                "orchestrator {} requires a local dataset".format(self.name))

        if self._resurrection:
            self.head = self.job_spec.get("head")
        else:
            self._configure_repo()
            self.head = self.ds.repo.get_hexsha()
            _datalad_check_container(self.ds, self.job_spec)
            _datalad_format_command(self.ds, self.job_spec)

    @property
    @cached_property
    @borrowdoc(Orchestrator)
    def working_directory(self):
        wdir = self.job_spec.get("working_directory")
        return wdir or op.join(self.root_directory, self.ds.id)

    @property
    @borrowdoc(Orchestrator)
    def local_directory(self):
        return self.ds.path

    @property
    @cached_property
    def job_refname(self):
        return "refs/reproman/{}".format(self.jobid)

    @borrowdoc(Orchestrator)
    def as_dict(self):
        d = super(DataladOrchestrator, self).as_dict()
        d["dataset_id"] = self.ds.id
        d["head"] = self.head
        return d

    @property
    def status(self):
        """Like Orchestrator.status, but inspect the job's git ref if needed.
        """
        status = super(DataladOrchestrator, self).status
        if status == "unknown":
            # The local tree might be different because of another job. Check
            # the ref for the status.
            status_from_ref = self._execute_in_wdir(
                "git cat-file -p {}:{}".format(
                    self.job_refname,
                    op.relpath(op.join(self.meta_directory, "status"),
                               self.working_directory)))
            status = status_from_ref.strip() or status
        return status

    def _configure_repo(self):
        gitignore = op.join(self.ds.path, ".reproman", "jobs", ".gitignore")
        write_update(gitignore,
                     ("# Automatically created by ReproMan.\n"
                      "# Do not change manually.\n"
                      "log\n"))

        gitattrs = op.join(self.ds.path, ".reproman", "jobs", ".gitattributes")
        write_update(gitattrs,
                     ("# Automatically created by ReproMan.\n"
                      "# Do not change manually.\n"
                      "status annex.largefiles=nothing\n"
                      "idmap annex.largefiles=nothing\n"))

        self.ds.add([gitignore, gitattrs],
                    message="[ReproMan] Configure jobs directory")
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(opj(dirname(dirname(__file__)), 'tests', 'data', src),
             opj(path, dst))
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched_key, matched_val in (
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             'meta', 'mp3'),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             'audio.format', 'mp3'),
            # XXX next one is not supported by current text field analyser
            # decomposes the mime type in [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             'audio.format', 'mp3'),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        if mode == 'textblob':
            # 'textblob' does datasets by default only (but could be
            # configured otherwise)
            assert_result_count(res, 1)
        else:
            # the rest always yield a file and the dataset, because they
            # carry metadata in the same structure
            assert_result_count(res, 2)
            assert_result_count(
                res, 1, type='file', path=opj(ds.path, hitpath),
                # each file must report the ID of the dataset it is from,
                # critical for discovering related content
                dsid=ds.id)
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        assert_in(matched_key, res[-1]['query_matched'])
        assert_equal(res[-1]['query_matched'][matched_key], matched_val)
def test_within_ds_file_search(path):
    try:
        import nibabel
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'nifti1', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('nifti1.nii.gz',
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz')),
            ('nifti1.nii.gz',
             opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'))):
        copy(opj(dirname(dirname(__file__)), 'tests', 'data', 'files', src),
             opj(path, dst))
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('bids', 'nifti1'):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.key
bids.BIDSVersion
bids.author
bids.citation
bids.conformsto
bids.datatype
bids.description
"""
    if external_versions['bids'] >= '0.9':
        target_out += "bids.extension\n"
    target_out += """\
bids.fundedby
bids.license
bids.name
bids.subject.age(years)
bids.subject.gender
bids.subject.handedness
bids.subject.hearing_problems_current
bids.subject.id
bids.subject.language
bids.suffix
bids.task
datalad_core.id
datalad_core.refcommit
id
nifti1.cal_max
nifti1.cal_min
nifti1.datatype
nifti1.description
nifti1.dim
nifti1.freq_axis
nifti1.intent
nifti1.magic
nifti1.phase_axis
nifti1.pixdim
nifti1.qform_code
nifti1.sform_code
nifti1.sizeof_hdr
nifti1.slice_axis
nifti1.slice_duration
nifti1.slice_end
nifti1.slice_order
nifti1.slice_start
nifti1.spatial_resolution(mm)
nifti1.t_unit
nifti1.temporal_spacing(s)
nifti1.toffset
nifti1.vox_offset
nifti1.xyz_unit
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump,
        # so we will use diff
        diff = list(unified_diff(target_out.splitlines(),
                                 cmo.out.splitlines()))
        assert_in(target_out, cmo.out,
                  msg="Diff: %s" % os.linesep.join(diff))

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched_key, matched_val in (
            # random keyword query
            # multi-word query implies AND
            ('textblob',
             ['bold', 'female'],
             opj('sub-03', 'func', 'sub-03_task-some_bold.nii.gz'),
             'meta', 'female'),
            # report which field matched with auto-field
            ('autofield',
             'female',
             opj('sub-03', 'func', 'sub-03_task-other_bold.nii.gz'),
             'bids.subject.gender', 'female'),
            # autofield multi-word query is also AND
            ('autofield',
             ['bids.suffix:bold', 'bids.subject.id:01'],
             opj('sub-01', 'func', 'sub-01_task-some_bold.nii.gz'),
             'bids.suffix', 'bold'),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        if mode == 'textblob':
            # 'textblob' does datasets by default only (but could be
            # configured otherwise)
            assert_result_count(res, 1)
        else:
            # the rest always yield a file and the dataset, because they
            # carry metadata in the same structure
            assert_result_count(res, 2)
            assert_result_count(
                res, 1, type='file', path=opj(ds.path, hitpath),
                # each file must report the ID of the dataset it is from,
                # critical for discovering related content
                dsid=ds.id)
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        assert_in(matched_key, res[-1]['query_matched'])
        assert_equal(res[-1]['query_matched'][matched_key], matched_val)
class supers(SuprocBenchmarks):
    """
    Benchmarks on common operations on collections of datasets using datalad API
    """

    timeout = 3600
    # need to assure that we are working in a different repository now
    # see https://github.com/datalad/datalad/issues/1512
    # might not be sufficient due to side effects between tests and
    # thus getting into the same situation
    ds_count = 0

    def setup_cache(self):
        # creating in CWD so things get removed when ASV is done
        ds_path = create_test_dataset("testds1", spec='2/-2/-2', seed=0)[0]
        # Will store into a tarfile since otherwise install -r is way too slow
        # to be invoked for every benchmark
        tarfile_path = opj(osp.dirname(ds_path), 'testds1.tar')
        with tarfile.open(tarfile_path, "w") as tar:
            # F.CK -- Python tarfile can't later extract those because key dirs are
            # read-only.  For now just a workaround - make it all writeable
            from datalad.utils import rotree
            rotree('testds1', ro=False, chmod_files=False)
            tar.add('testds1', recursive=True)
        rmtree('testds1')
        return tarfile_path

    def setup(self, tarfile_path):
        import tarfile
        tempdir = osp.dirname(tarfile_path)
        with tarfile.open(tarfile_path) as tar:
            tar.extractall(tempdir)
        # TODO -- remove this abomination after
        # https://github.com/datalad/datalad/issues/1512 is fixed
        epath = opj(tempdir, 'testds1')
        epath_unique = epath + str(self.__class__.ds_count)
        os.rename(epath, epath_unique)
        self.__class__.ds_count += 1
        self.ds = Dataset(epath_unique)
        print("Finished setup for %s" % tempdir)

    def teardown(self, tarfile_path):
        for path in [self.ds.path + '_', self.ds.path]:
            print("Cleaning up %s" % path)
            if osp.exists(path):
                rmtree(path)

    def time_installr(self, tarfile_path):
        # somewhat duplicating setup but lazy to do different one for now
        assert install(self.ds.path + '_', source=self.ds.path, recursive=True)

    def time_createadd(self, tarfile_path):
        assert self.ds.create('newsubds')

    def time_createadd_to_dataset(self, tarfile_path):
        subds = create(opj(self.ds.path, 'newsubds'))
        self.ds.add(subds.path)

    def time_ls(self, tarfile_path):
        ls(self.ds.path)

    def time_ls_recursive(self, tarfile_path):
        ls(self.ds.path, recursive=True)

    def time_ls_recursive_long_all(self, tarfile_path):
        ls(self.ds.path, recursive=True, long_=True, all_=True)

    # TODO: since doesn't really allow to uninstall top level ds... bleh ;)
    #def time_uninstall(self, tarfile_path):
    #    uninstall(self.ds.path, recursive=True)

    def time_remove(self, tarfile_path):
        remove(self.ds.path, recursive=True)
class supers(SuprocBenchmarks):
    """
    Benchmarks on common operations on collections of datasets using datalad API
    """

    timeout = 3600
    # need to assure that we are working in a different repository now
    # see https://github.com/datalad/datalad/issues/1512
    # might not be sufficient due to side effects between tests and
    # thus getting into the same situation
    ds_count = 0

    def setup_cache(self):
        # creating in CWD so things get removed when ASV is done
        ds_path = create_test_dataset("testds1", spec='2/-2/-2', seed=0)[0]
        # Will store into a tarfile since otherwise install -r is way too slow
        # to be invoked for every benchmark
        tarfile_path = opj(osp.dirname(ds_path), 'testds1.tar')
        with tarfile.open(tarfile_path, "w") as tar:
            # F.CK -- Python tarfile can't later extract those because key dirs are
            # read-only.  For now just a workaround - make it all writeable
            from datalad.utils import rotree
            rotree('testds1', ro=False, chmod_files=False)
            tar.add('testds1', recursive=True)
        rmtree('testds1')
        return tarfile_path

    def setup(self, tarfile_path):
        import tarfile
        tempdir = osp.dirname(tarfile_path)
        with tarfile.open(tarfile_path) as tar:
            tar.extractall(tempdir)
        # TODO -- remove this abomination after
        # https://github.com/datalad/datalad/issues/1512 is fixed
        epath = opj(tempdir, 'testds1')
        epath_unique = epath + str(self.__class__.ds_count)
        os.rename(epath, epath_unique)
        self.__class__.ds_count += 1
        self.ds = Dataset(epath_unique)
        print("Finished setup for %s" % tempdir)

    def teardown(self, tarfile_path):
        for path in [self.ds.path + '_', self.ds.path]:
            print("Cleaning up %s" % path)
            if osp.exists(path):
                rmtree(path)

    def time_installr(self, tarfile_path):
        # somewhat duplicating setup but lazy to do different one for now
        assert install(self.ds.path + '_', source=self.ds.path, recursive=True)

    def time_rev_createadd(self, tarfile_path):
        assert self.ds.rev_create('newsubds')

    def time_rev_createadd_to_dataset(self, tarfile_path):
        subds = rev_create(opj(self.ds.path, 'newsubds'))
        self.ds.rev_save(subds.path)

    def time_createadd(self, tarfile_path):
        assert self.ds.create('newsubds')

    def time_createadd_to_dataset(self, tarfile_path):
        subds = create(opj(self.ds.path, 'newsubds'))
        self.ds.add(subds.path)

    def time_ls(self, tarfile_path):
        ls(self.ds.path)

    def time_ls_recursive(self, tarfile_path):
        ls(self.ds.path, recursive=True)

    def time_ls_recursive_long_all(self, tarfile_path):
        ls(self.ds.path, recursive=True, long_=True, all_=True)

    # TODO: since doesn't really allow to uninstall top level ds... bleh ;)
    #def time_uninstall(self, tarfile_path):
    #    uninstall(self.ds.path, recursive=True)

    def time_remove(self, tarfile_path):
        remove(self.ds.path, recursive=True)
def test_aggregation(path):
    with chpwd(path):
        assert_raises(InsufficientArgumentsError, aggregate_metadata, None)
    # a hierarchy of three (super/sub)datasets, each with some native metadata
    ds = Dataset(opj(path, 'origin')).create(force=True)
    ds.config.add('datalad.metadata.nativetype', 'bids', where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'frictionless_datapackage',
                  where='dataset')
    subds = ds.create('sub', force=True)
    subds.config.add('datalad.metadata.nativetype', 'bids', where='dataset')
    subsubds = subds.create('subsub', force=True)
    subsubds.config.add('datalad.metadata.nativetype', 'bids', where='dataset')
    ds.add('.', recursive=True)
    ok_clean_git(ds.path)
    # aggregate metadata from all subdatasets into any superdataset, including
    # intermediate ones
    res = ds.aggregate_metadata(recursive=True)
    # we get success reports for both subdatasets and the superdataset,
    # and they get saved
    assert_result_count(res, 6)
    assert_result_count(res, 3, status='ok', action='aggregate_metadata')
    assert_result_count(res, 3, status='ok', action='save')
    # nice and tidy
    ok_clean_git(ds.path)
    # store clean direct result
    origres = ds.metadata(recursive=True)
    # basic sanity check
    assert_result_count(origres, 7)
    assert_result_count(origres, 3, type='dataset')
    assert_result_count(origres, 4, type='file')
    # three different IDs
    assert_equal(
        3, len(set([s['dsid'] for s in origres if s['type'] == 'dataset'])))
    # and we know about all three datasets
    for name in ('mother_äöü東', 'child_äöü東', 'grandchild_äöü東'):
        assert_true(
            sum([
                s['metadata']['name'] == assure_unicode(name)
                for s in origres
                if s['type'] == 'dataset'
            ]))

    # now clone the beast to simulate a new user installing an empty dataset
    clone = install(opj(path, 'clone'),
                    source=ds.path,
                    result_xfm='datasets',
                    return_type='item-or-list')
    # ID mechanism works
    assert_equal(ds.id, clone.id)

    # get fresh metadata
    cloneres = clone.metadata()
    # basic sanity check
    assert_result_count(cloneres, 3)
    assert_result_count(cloneres, 1, type='dataset')
    assert_result_count(cloneres, 2, type='file')

    # now loop over the previous results from the direct metadata query of
    # origin and make sure we get the exact same stuff from the clone
    _compare_metadata_helper(origres, clone)

    # now obtain a subdataset in the clone, should make no difference
    assert_status('ok',
                  clone.install('sub', result_xfm=None, return_type='list'))
    _compare_metadata_helper(origres, clone)

    # query smoke test
    assert_result_count(clone.search('mother*'), 1)
    assert_result_count(clone.search('MoTHER*'), 1)

    child_res = clone.search('*child*')
    assert_result_count(child_res, 2)
    for r in child_res:
        if r['metadata']['type'] == 'dataset':
            eq_(r['query_matched']['name'], r['metadata']['name'])

    # Test 'and' for multiple search entries
    assert_result_count(clone.search(['*child*', '*bids*']), 2)
    assert_result_count(clone.search(['*child*', '*subsub*']), 1)
    assert_result_count(clone.search(['*bids*', '*sub*']), 2)

    assert_result_count(clone.search(['*', 'type:dataset']), 3)
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.add('.')
    # yoh: CANNOT FIGURE IT OUT since in direct mode it gets added to git
    # directly BUT
    #  - output reports key, so seems to be added to annex!
    #  - when I do manually in cmdline - goes to annex
    ok_file_under_git(path, opj('stim', 'stim1.mp3'), annexed=True)

    # If it is not under annex, the addition of metadata below silently does
    # not do anything
    list(ds.repo.set_metadata(
        opj('stim', 'stim1.mp3'), init={'importance': 'very'}))
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
annex.importance
annex.key
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
            ('egrep',
             ':mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, leading : is stripped, it indicates "ALL FIELDS"
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, but with AND condition
            # get both matches
            ('egrep',
             ['mp3', 'type:file'],
             opj('stim', 'stim1.mp3'),
             {'type': 'file', 'audio.format': 'mp3'}),
            # case insensitive search
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # field selection by expression
            ('egrep',
             r'audio\.+:mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'meta': 'mp3'}),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # XXX next one is not supported by current text field analyser
            # decomposes the mime type in [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from,
            # critical for discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)

    # test a suggestion msg being logged if no hits and key is a bit off
    with swallow_logs(new_level=logging.INFO) as cml:
        res = ds.search('audio.formats:mp3 audio.bitsrate:1', mode='egrep')
        assert not res
        assert_in('Did you mean any of', cml.out)
        assert_in('audio.format', cml.out)
        assert_in('audio.bitrate', cml.out)
def add_to_datalad(topdir, studydir, msg, bids):
    """Do all necessary preparations (if they were not done before) and save
    """
    from datalad.api import create
    from datalad.api import Dataset
    from datalad.support.annexrepo import AnnexRepo
    from datalad.support.external_versions import external_versions
    assert external_versions['datalad'] >= MIN_VERSION, (
        "Need datalad >= {}".format(MIN_VERSION))  # add to reqs

    create_kwargs = {}
    if external_versions['datalad'] >= '0.10':
        create_kwargs['fake_dates'] = True  # fake dates by default

    studyrelpath = op.relpath(studydir, topdir)
    assert not studyrelpath.startswith(op.pardir)  # so we are under
    # now we need to test and initiate a DataLad dataset all along the path
    curdir_ = topdir
    superds = None
    subdirs = [''] + studyrelpath.split(op.sep)
    for isubdir, subdir in enumerate(subdirs):
        curdir_ = op.join(curdir_, subdir)
        ds = Dataset(curdir_)
        if not ds.is_installed():
            lgr.info("Initiating %s", ds)
            # would require annex > 20161018 for correct operation on annex v6
            # need to add .gitattributes first anyways
            ds_ = create(curdir_, dataset=superds,
                         force=True,
                         no_annex=True,
                         # shared_access='all',
                         annex_version=6,
                         **create_kwargs
                         )
            assert ds == ds_
        assert ds.is_installed()
        superds = ds

    # TODO: we need a helper (in DataLad ideally) to ease adding such
    # specifications
    gitattributes_path = op.join(studydir, '.gitattributes')
    # We will just make sure that all our desired rules are present in it
    desired_attrs = """\
* annex.largefiles=(largerthan=100kb)
*.json annex.largefiles=nothing
*.txt annex.largefiles=nothing
*.tsv annex.largefiles=nothing
*.nii.gz annex.largefiles=anything
*.tgz annex.largefiles=anything
*_scans.tsv annex.largefiles=anything
"""
    if op.exists(gitattributes_path):
        with open(gitattributes_path, 'rb') as f:
            known_attrs = [line.decode('utf-8').rstrip()
                           for line in f.readlines()]
    else:
        known_attrs = []
    for attr in desired_attrs.split('\n'):
        if attr not in known_attrs:
            known_attrs.append(attr)
    with open(gitattributes_path, 'wb') as f:
        f.write('\n'.join(known_attrs).encode('utf-8'))

    # so for mortals it just looks like a regular directory!
    if not ds.config.get('annex.thin'):
        ds.config.add('annex.thin', 'true', where='local')
    # initialize annex there if not yet initialized
    AnnexRepo(ds.path, init=True)
    # ds might have memories of having ds.repo GitRepo
    superds = None
    del ds
    ds = Dataset(studydir)
    # Add doesn't have all the options of save such as msg and supers
    ds.add('.gitattributes', to_git=True, save=False)
    dsh = dsh_path = None
    if op.lexists(op.join(ds.path, '.heudiconv')):
        dsh_path = op.join(ds.path, '.heudiconv')
        dsh = Dataset(dsh_path)
        if not dsh.is_installed():
            # Previously we did not have it as a submodule, and since no
            # automagic migration is implemented, we just need to check first
            # if any path under .heudiconv is already under git control
            if any(x[0].startswith('.heudiconv/')
                   for x in ds.repo.repo.index.entries.keys()):
                lgr.warning("%s has .heudiconv not as a submodule from previous"
                            " versions of heudiconv. No automagic migration is "
                            "yet provided", ds)
            else:
                dsh = ds.create(path='.heudiconv',
                                force=True,
                                **create_kwargs
                                # shared_access='all'
                                )
        # Since .heudiconv could contain sensitive information
        # we place all files under annex and then add
        if create_file_if_missing(op.join(dsh_path, '.gitattributes'),
                                  """* annex.largefiles=anything"""):
            ds.add('.heudiconv/.gitattributes',
                   to_git=True,
                   message="Added gitattributes to place all .heudiconv content"
                           " under annex")
    ds.add('.', recursive=True, save=False,
           # not in effect! ?
           #annex_add_opts=['--include-dotfiles']
           )

    # TODO: filter for only changed files?
    # Provide metadata for sensitive information
    mark_sensitive(ds, 'sourcedata')
    mark_sensitive(ds, '*_scans.tsv')  # top level
    mark_sensitive(ds, '*/*_scans.tsv')  # within subj
    mark_sensitive(ds, '*/*/*_scans.tsv')  # within sess/subj
    mark_sensitive(ds, '*/anat')  # within subj
    mark_sensitive(ds, '*/*/anat')  # within ses/subj
    if dsh_path:
        mark_sensitive(ds, '.heudiconv')  # entire .heudiconv!
    ds.save(message=msg, recursive=True, super_datasets=True)

    assert not ds.repo.dirty
    # TODO: they are still appearing as native annex symlinked beasts
class DataladOrchestrator(Orchestrator, metaclass=abc.ABCMeta):
    """Execute command assuming (at least) a local dataset.
    """

    def __init__(self, resource, submission_type, job_spec=None,
                 resurrection=False):
        if not external_versions["datalad"]:
            raise MissingExternalDependency(
                "DataLad is required for orchestrator '{}'".format(self.name))

        super(DataladOrchestrator, self).__init__(
            resource, submission_type, job_spec, resurrection=resurrection)

        from datalad.api import Dataset
        self.ds = Dataset(".")
        if not self.ds.id:
            raise OrchestratorError(
                "orchestrator {} requires a local dataset".format(self.name))

        if self._resurrection:
            self.head = self.job_spec.get("_head")
        else:
            if self.ds.repo.dirty:
                raise OrchestratorError(
                    "Local dataset {} is dirty. "
                    "Save or discard uncommitted changes".format(self.ds.path))
            self._configure_repo()
            self.head = self.ds.repo.get_hexsha()
            _datalad_check_container(self.ds, self.job_spec)
            _datalad_format_command(self.ds, self.job_spec)

    @property
    @cached_property
    @borrowdoc(Orchestrator)
    def working_directory(self):
        wdir = self.job_spec.get("working_directory")
        return wdir or op.join(self.root_directory, self.ds.id)

    @property
    @borrowdoc(Orchestrator)
    def local_directory(self):
        return self.ds.path

    @property
    @cached_property
    def job_refname(self):
        return "refs/reproman/{}".format(self.jobid)

    @borrowdoc(Orchestrator)
    def as_dict(self):
        d = super(DataladOrchestrator, self).as_dict()
        d["_dataset_id"] = self.ds.id
        d["_head"] = self.head
        return d

    def _prepare_spec(self):
        # Disabled. _datalad_check_container() and _datalad_format_command()
        # handle this in __init__(). We can't just call those here because
        # self.ds wouldn't be defined yet.
        pass

    def _configure_repo(self):
        gitignore = op.join(self.ds.path, ".reproman", "jobs", ".gitignore")
        write_update(gitignore,
                     ("# Automatically created by ReproMan.\n"
                      "# Do not change manually.\n"
                      "log.*\n"))

        gitattrs = op.join(self.ds.path, ".reproman", "jobs", ".gitattributes")
        write_update(gitattrs,
                     ("# Automatically created by ReproMan.\n"
                      "# Do not change manually.\n"
                      "status.[0-9]* annex.largefiles=nothing\n"
                      "**/failed/* annex.largefiles=nothing\n"
                      "idmap annex.largefiles=nothing\n"))

        self.ds.add([gitignore, gitattrs],
                    message="[ReproMan] Configure jobs directory")
def test_within_ds_file_search(path):
    try:
        import mutagen
    except ImportError:
        raise SkipTest
    ds = Dataset(path).create(force=True)
    # override default and search for datasets and files for this test
    for m in ('egrep', 'textblob', 'autofield'):
        ds.config.add(
            'datalad.search.index-{}-documenttype'.format(m), 'all',
            where='dataset')
    ds.config.add('datalad.metadata.nativetype', 'audio', where='dataset')
    makedirs(opj(path, 'stim'))
    for src, dst in (
            ('audio.mp3', opj('stim', 'stim1.mp3')),):
        copy(
            opj(dirname(dirname(__file__)), 'tests', 'data', src),
            opj(path, dst))
    ds.add('.')
    ds.aggregate_metadata()
    ok_clean_git(ds.path)
    # basic sanity check on the metadata structure of the dataset
    dsmeta = ds.metadata('.', reporton='datasets')[0]['metadata']
    for src in ('audio',):
        # something for each one
        assert_in(src, dsmeta)
        # each src declares its own context
        assert_in('@context', dsmeta[src])
        # we have a unique content metadata summary for each src
        assert_in(src, dsmeta['datalad_unique_content_properties'])

    # test default behavior
    with swallow_outputs() as cmo:
        ds.search(show_keys='name', mode='textblob')

        assert_in("""\
id
meta
parentds
path
type
""", cmo.out)

    target_out = """\
audio.bitrate
audio.duration(s)
audio.format
audio.music-Genre
audio.music-album
audio.music-artist
audio.music-channels
audio.music-sample_rate
audio.name
audio.tracknumber
datalad_core.id
datalad_core.refcommit
id
parentds
path
type
"""

    # check generated autofield index keys
    with swallow_outputs() as cmo:
        ds.search(mode='autofield', show_keys='name')
        # it is impossible to assess what is different from that dump
        assert_in(target_out, cmo.out)

    assert_result_count(ds.search('blablob#'), 0)
    # now check that we can discover things from the aggregated metadata
    for mode, query, hitpath, matched in (
            ('egrep',
             ':mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, leading : is stripped, it indicates "ALL FIELDS"
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # same as above, but with AND condition
            # get both matches
            ('egrep',
             ['mp3', 'type:file'],
             opj('stim', 'stim1.mp3'),
             {'type': 'file', 'audio.format': 'mp3'}),
            # case insensitive search
            ('egrep',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # field selection by expression
            ('egrep',
             r'audio\.+:mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # random keyword query
            ('textblob',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'meta': 'mp3'}),
            # report which field matched with auto-field
            ('autofield',
             'mp3',
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # XXX next one is not supported by current text field analyser
            # decomposes the mime type in [mime, audio, mp3]
            # ('autofield',
            #  "'mime:audio/mp3'",
            #  opj('stim', 'stim1.mp3'),
            #  'audio.format', 'mime:audio/mp3'),
            # but this one works
            ('autofield',
             "'mime audio mp3'",
             opj('stim', 'stim1.mp3'),
             {'audio.format': 'mp3'}),
            # TODO extend with more complex queries to test whoosh
            # query language configuration
    ):
        res = ds.search(query, mode=mode, full_record=True)
        assert_result_count(
            res, 1, type='file', path=opj(ds.path, hitpath),
            # each file must report the ID of the dataset it is from,
            # critical for discovering related content
            dsid=ds.id)
        # in egrep we currently do not search unique values
        # and the queries above aim at files
        assert_result_count(res, 1 if mode == 'egrep' else 2)
        if mode != 'egrep':
            assert_result_count(
                res, 1, type='dataset', path=ds.path, dsid=ds.id)
        # test the key and specific value of the match
        for matched_key, matched_val in matched.items():
            assert_in(matched_key, res[-1]['query_matched'])
            assert_equal(res[-1]['query_matched'][matched_key], matched_val)
def test_exif(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'exif', where='dataset')
    copy(opj(dirname(dirname(dirname(__file__))), 'tests', 'data', 'exif.jpg'),
         path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    res = ds.metadata('exif.jpg')
    assert_result_count(res, 1)
    # compare full expected metadata set to catch any change of mind on the
    # side of the EXIF library
    assert_result_count(
        res, 1,
        metadata={
            "exif:InteroperabilityVersion": "[48, 49, 48, 48]",
            "exif:ExifVersion": 221.0,
            "exif:FocalLengthIn35mmFilm": 38.0,
            "exif:CompressedBitsPerPixel": 5.0,
            "exif:GainControl": "None",
            "exif:Compression": "JPEG (old-style)",
            "exif:PrintIM": "[80, 114, 105, 110, 116, 73, 77, 0, 48, 51, 48, 48, 0, 0, 0, 5, 0, 1, 0, 22, 0, 22, 0, 2, 1, 0, 0, 0, 1, 0, 5, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 16, 131, 0, 0, 0]",
            "exif:Make": "CASIO COMPUTER CO.,LTD.",
            "exif:Sharpness": "Normal",
            "exif:Contrast": "Normal",
            "exif:ColorSpace": "sRGB",
            "exif:ExposureMode": "Auto Exposure",
            "exif:ExposureBiasValue": 0.0,
            "exif:ExifImageWidth": 4.0,
            "exif:ComponentsConfiguration": "YCbCr",
            "exif:DateTimeOriginal": "2011:03:13 16:36:02",
            "exif:MaxApertureValue": "14/5",
            "exif:DateTime": "2017:10:08 10:21:03",
            "exif:InteroperabilityOffset": 30412.0,
            "exif:InteroperabilityIndex": "R98",
            "exif:FileSource": "Digital Camera",
            "exif:ResolutionUnit": "Pixels/Inch",
            "exif:FNumber": "27/10",
            "exif:ExposureProgram": "Program Normal",
            "exif:DigitalZoomRatio": "0/0",
            "exif:LightSource": "Unknown",
            "exif:ExifImageLength": 3.0,
            "exif:FlashPixVersion": 100.0,
            "exif:CustomRendered": "Normal",
            "exif:Flash": "Flash fired, auto mode",
            "exif:WhiteBalance": "Auto",
            "exif:Orientation": "Horizontal (normal)",
            "exif:ExposureTime": "1/60",
            "exif:Software": "GIMP 2.8.20",
            "exif:Model": "EX-S600",
            "exif:FocalLength": "31/5",
            "exif:SceneCaptureType": "Standard",
            "exif:ExifOffset": 272.0,
            "exif:Saturation": "Normal",
            "exif:YCbCrPositioning": "Centered",
            "exif:DateTimeDigitized": "2011:03:13 16:36:02",
            "exif:XResolution": 72.0,
            "exif:YResolution": 72.0,
            "exif:MeteringMode": "Pattern",
        })
def test_dicom(path):
    ds = Dataset(path).create()
    ds.config.add('datalad.metadata.nativetype', 'dicom', where='dataset')
    copy(
        op.join(op.dirname(op.dirname(op.dirname(__file__))), 'tests', 'data',
                'files', 'dicom.dcm'),
        path)
    ds.add('.')
    ok_clean_git(ds.path)
    res = ds.aggregate_metadata()
    assert_status('ok', res)
    # query for the file metadata
    res = ds.metadata('dicom.dcm')
    assert_result_count(res, 1)
    # from this extractor
    meta = res[0]['metadata']['dicom']
    assert_in('@context', meta)
    # no point in testing ALL keys, but we got plenty
    assert(len(meta.keys()) > 70)
    eq_(meta['SeriesDate'], '20070205')
    # Actually a tricky one of the dcm.multival.MultiValue type,
    # which we should extract as a list
    # https://github.com/datalad/datalad-neuroimaging/issues/49
    eq_(meta['ImageType'], ['ORIGINAL', 'PRIMARY', 'EPI', 'NONE'])
    # make sure we have PatientName -- this is not using a basic data type, but
    # dicom.valuerep.PersonName3 -- conversion should have handled that;
    # we can only test if the key is there, the source dicom has an empty
    # string as value
    eq_(meta['PatientName'], '')

    # now ask for the dataset metadata, which should have both the unique props
    # and a list of imageseries (one in this case, but a list)
    res = ds.metadata(reporton='datasets')
    assert_result_count(res, 1)
    dsmeta = res[0]['metadata']['dicom']
    # same context
    assert_dict_equal(meta['@context'], dsmeta['@context'])
    meta.pop('@context')
    seriesmeta = dsmeta['Series']
    eq_(seriesmeta[0].pop('SeriesDirectory'), op.curdir)
    eq_(dsmeta['Series'], [meta])

    # for this artificial case pretty much the same info also comes out as
    # unique props, but wrapped in lists
    ucp = res[0]['metadata']["datalad_unique_content_properties"]['dicom']
    assert_dict_equal(
        {k: [v]
         for k, v in dsmeta['Series'][0].items()
         if k not in DicomExtractor._unique_exclude and k in ucp},
        {k: v
         for k, v in ucp.items()
         if k not in DicomExtractor._unique_exclude})

    # buuuut, if we switch off file-based metadata storage
    ds.config.add('datalad.metadata.aggregate-content-dicom', 'false',
                  where='dataset')
    ds.aggregate_metadata()
    res = ds.metadata(reporton='datasets')

    if not datalad_extracts_annex_key:
        # the auto-uniquified bits are gone but the Series description stays
        assert_not_in("datalad_unique_content_properties", res[0]['metadata'])
    eq_(dsmeta['Series'], [meta])