def test_ds_extraction(path):
    """End-to-end dataset extraction: default run, explicit extractor, config gating."""
    ds = Dataset(path).create(force=True)
    # keep the aggregated metadata out of the extraction results
    ds.config.add('datalad.metadata.exclude-path', '.metadata', where='dataset')
    ds.save()
    assert_repo_status(ds.path)

    # default run: core and annex extractors report
    all_results = meta_extract(dataset=ds)
    # one dataset record plus two payload files
    assert_result_count(all_results, 3)
    assert_result_count(all_results, 1, type='dataset')
    assert_result_count(all_results, 2, type='file')
    # the core extractor contributes to every single record
    assert all('metalad_core' in r['metadata'] for r in all_results)

    # explicit extractor request; path=[] artificially disables extraction
    # from any file in the dataset
    ds_only = meta_extract(
        sources=['metalad_custom'],
        dataset=ds,
        path=[])
    assert_result_count(
        ds_only, 1,
        type='dataset', status='ok', action='meta_extract',
        path=path, refds=ds.path)
    assert_in('metalad_custom', ds_only[0]['metadata'])

    # the more useful case: everything 'metalad_custom' can report on
    custom_results = meta_extract(sources=['metalad_custom'], dataset=ds)
    assert_result_count(custom_results, 2)
    assert_result_count(
        custom_results, 1,
        type='dataset', status='ok', action='meta_extract',
        path=path, refds=ds.path)
    assert_result_count(
        custom_results, 1,
        type='file', status='ok', action='meta_extract',
        path=op.join(path, 'sub', 'one'), parentds=ds.path)
    for r in custom_results:
        assert_in('metalad_custom', r['metadata'])

    # restricting this extractor to dataset-level via config drops the
    # file record from the results
    ds.config.add(
        'datalad.metadata.extract-from-metalad-custom', 'dataset',
        where='dataset')
    assert_result_count(
        meta_extract(sources=['metalad_custom'], dataset=ds), 1)
def test_dataset_extraction_result(path):
    """Verify shape and content of a single dataset-level extraction record."""
    ds = Dataset(path).create(force=True)
    ds.config.add('datalad.metadata.exclude-path', '.metadata', where='dataset')
    ds.save()
    assert_repo_status(ds.path)

    extractor_name = "metalad_core_dataset"
    # ask the extractor implementation itself for the version it reports
    extractor_version = get_extractor_class(extractor_name)(
        None, None, None).get_version()

    res = meta_extract(extractorname=extractor_name, dataset=ds)
    # exactly one record, and it is a dataset record
    assert_result_count(res, 1)
    assert_result_count(res, 1, type='dataset')
    assert_result_count(res, 0, type='file')

    record = res[0]["metadata_record"]
    _check_metadata_record(
        metadata_record=record,
        dataset=ds,
        extractor_name=extractor_name,
        extractor_version=extractor_version,
        extraction_parameter={})

    payload = record["extracted_metadata"]
    eq_(payload["id"], ds.id)
    eq_(payload["refcommit"], ds.repo.get_hexsha())
    eq_(payload["dataset_path"], "")
    eq_(payload["comment"], "test-implementation of core_dataset")
def test_legacy2_file_extraction_result(ds_path):
    """A legacy extractor on a file yields an empty extracted-metadata payload."""
    ds = Dataset(ds_path).create(force=True)
    ds.config.add('datalad.metadata.exclude-path', '.metadata', where='dataset')
    ds.save()
    assert_repo_status(ds.path)

    file_path = "sub/one"
    extractor_name = "datalad_core"
    # legacy extractors do not report a real version
    extractor_version = "un-versioned"

    res = meta_extract(
        extractorname=extractor_name, path=file_path, dataset=ds)
    # a single file record, no dataset record
    assert_result_count(res, 1)
    assert_result_count(res, 1, type='file')
    assert_result_count(res, 0, type='dataset')

    record = res[0]["metadata_record"]
    _check_metadata_record(
        metadata_record=record,
        dataset=ds,
        extractor_name=extractor_name,
        extractor_version=extractor_version,
        extraction_parameter={},
        path=file_path)
    # this extractor has nothing to say about the file
    eq_(record["extracted_metadata"], {})
def test_legacy1_dataset_extraction_result(ds_path):
    """A legacy dataset extractor yields a JSON-LD context/graph payload."""
    ds = Dataset(ds_path).create(force=True)
    ds.config.add('datalad.metadata.exclude-path', '.metadata', where='dataset')
    ds.save()
    assert_repo_status(ds.path)

    extractor_name = "metalad_core"
    # this legacy extractor reports a fixed version string
    extractor_version = "1"

    res = meta_extract(extractorname=extractor_name, dataset=ds)
    # a single dataset record, no file records
    assert_result_count(res, 1)
    assert_result_count(res, 1, type='dataset')
    assert_result_count(res, 0, type='file')

    record = res[0]["metadata_record"]
    _check_metadata_record(
        metadata_record=record,
        dataset=ds,
        extractor_name=extractor_name,
        extractor_version=extractor_version,
        extraction_parameter={})

    payload = record["extracted_metadata"]
    assert_in("@context", payload)
    assert_in("@graph", payload)
    eq_(len(payload["@graph"]), 2)
def test_path_parameter_recognition(ds_path):
    """Giving a path must route the request to file (not dataset) extraction."""
    ds = Dataset(ds_path).create(force=True)
    ds.config.add('datalad.metadata.exclude-path', '.metadata', where='dataset')
    ds.save()
    assert_repo_status(ds.path)

    with patch("datalad_metalad.extract.do_file_extraction") as file_extraction, \
            patch("datalad_metalad.extract.do_dataset_extraction") as dataset_extraction:
        meta_extract(
            extractorname="metalad_core_file",
            dataset=ds,
            path="sub/one")
        # only the file-extraction entry point may have been invoked
        eq_(file_extraction.call_count, 1)
        eq_(dataset_extraction.call_count, 0)
def test_file_extraction(path):
    """File-level extraction works even outside of any dataset."""
    target = op.join(path, 'sub', 'one')
    # run from a virgin dir to avoid detection of any dataset
    with chpwd(path):
        res = meta_extract(sources=['metalad_custom'], path=[target])
        assert_result_count(
            res, 1,
            type='file', status='ok', action='meta_extract', path=target)
        assert_in('metalad_custom', res[0]['metadata'])
        eq_(res[0]['metadata']['metalad_custom'], sample_fmeta)
def test_plainest(path):
    """Extraction on a path that is not (yet) a proper dataset fails cleanly.

    Covers three escalation steps: no repo at all (ValueError), a bare git
    repo (error result), and a repo with non-metadata-relevant content
    (still an error result).
    """
    # blow on nothing
    assert_raises(
        ValueError,
        meta_extract, dataset=path, sources=['metalad_core'])
    r = GitRepo(path, create=True)
    # proper error, no crash, when there is the thinnest of all traces
    # of a dataset: but nothing to describe
    assert_result_count(
        meta_extract(
            dataset=r.path,
            on_failure='ignore',
        ),
        1,
        status='error',  # message contains exception
        type='dataset',
        path=r.path,
    )
    # now we add some dummy content that does not count as metadata-relevant
    # and we still fail
    (r.pathobj / '.datalad').mkdir()
    # a plain str literal suffices; the six-style text_type() wrapper was a
    # Python 2 compatibility leftover and is a no-op on Python 3
    (r.pathobj / '.datalad' / 'dummy').write_text('stamp')
    ds = Dataset(r.path)
    ds.save()
    assert_result_count(
        meta_extract(
            dataset=ds.path,
            sources=['metalad_core'],
            on_failure='ignore',
        ),
        1,
        status='error',  # message contains exception
        type='dataset',
        path=ds.path,
    )
def test_path_and_extra_parameter_recognition(ds_path):
    """A flat key/value extractor-argument list reaches file extraction as a dict."""
    ds = Dataset(ds_path).create(force=True)
    ds.config.add('datalad.metadata.exclude-path', '.metadata', where='dataset')
    ds.save()
    assert_repo_status(ds.path)

    with patch("datalad_metalad.extract.do_file_extraction") as file_extraction, \
            patch("datalad_metalad.extract.do_dataset_extraction") as dataset_extraction:
        meta_extract(
            extractorname="metalad_core_file",
            dataset=ds,
            path="sub/one",
            extractorargs=["k1", "v1", "k2", "v2", "k3", "v3"])
        # a path-based request must never trigger dataset extraction
        eq_(dataset_extraction.call_count, 0)
        eq_(file_extraction.call_count, 1)
        # the flat argument list is paired up into a mapping
        eq_(
            file_extraction.call_args_list[0][0][0].extractor_arguments,
            {"k1": "v1", "k2": "v2", "k3": "v3"})
def test_report(path, orig):
    """Check 'dataset', 'content', and 'all' process types on a content-less clone."""
    origds, subds = make_ds_hierarchy_with_metadata(orig)
    # clone to a new place to ensure no content is present locally
    ds = install(source=origds.path, path=path)

    # dataset-global metadata only
    dsres = meta_extract(dataset=ds, process_type='dataset')
    assert_result_count(dsres, 1)
    core_dsmeta = _get_dsmeta_from_core_metadata(
        dsres[0]['metadata']['metalad_core'])
    # the subdataset is reported as a part of the dataset
    assert_in(
        {'@type': 'Dataset',
         '@id': 'datalad:{}'.format(subds.repo.get_hexsha()),
         'identifier': 'datalad:{}'.format(subds.id),
         'name': 'sub'},
        core_dsmeta['hasPart'])
    # content was never obtained, so no size can be reported
    assert_not_in('contentbytesize', core_dsmeta)

    # content records only
    cres = meta_extract(dataset=ds, process_type='content')
    assert any(
        dict(tag=['one', 'two']) == r['metadata'].get('metalad_annex', None)
        for r in cres)
    # there is at least one file report ...
    assert len(cres) > 0
    # ... but no subdataset reports
    assert_result_count(cres, 0, type='dataset')
    content_size = sum(
        (_get_dsmeta_from_core_metadata(r['metadata']['metalad_core'])
         if r['type'] == 'dataset'
         else r['metadata']['metalad_core'])['contentbytesize']
        for r in cres)

    # and now all together
    allres = meta_extract(dataset=ds, process_type='all')
    # the leading record carries a content size report that sums up all
    # individual sizes
    first = allres[0]
    first_core = (
        _get_dsmeta_from_core_metadata(first['metadata']['metalad_core'])
        if first['type'] == 'dataset'
        else first['metadata']['metalad_core'])
    eq_(first_core['contentbytesize'], content_size)
def check_api(no_annex, path):
    """Run every installed metadata extractor on a minimal dataset.

    Extractors that raise during extraction are collected and reported via
    SkipTest at the end, so the remaining extractors are still exercised.
    The core extractor must always succeed and gets its results checked in
    detail.
    """
    ds = Dataset(path).create(force=True, no_annex=no_annex)
    ds.save()
    assert_repo_status(ds.path)

    processed_extractors = []
    skipped_extractors = []
    for extractor_ep in iter_entry_points('datalad.metadata.extractors'):
        # we need to be able to query for metadata, even if there is none
        # from any extractor
        try:
            res = meta_extract(
                dataset=ds,
                sources=[extractor_ep.name],
            )
        except Exception as exc:
            skipped_extractors.append(str(exc))
            continue
        # metalad_core does provide some information about our precious file
        if extractor_ep.name == 'metalad_core':
            assert_result_count(
                res, 1,
                path=ds.path,
                type='dataset',
                status='ok',
            )
            assert_true(
                all('metalad_core' in r.get('metadata', {}) for r in res))
            # every single report comes with an identifier
            assert_true(all(
                (_get_dsid_from_core_metadata(r['metadata']['metalad_core'])
                 if r.get('type', None) == 'dataset'
                 else r['metadata']['metalad_core'].get('@id', None))
                is not None
                for r in res))
        processed_extractors.append(extractor_ep.name)
    # message fixed: previously read "core extractor extractor"
    assert "metalad_core" in processed_extractors, \
        "Should have managed to find at least the core extractor"
    if skipped_extractors:
        # message fixed: previously misspelled "succeded"
        raise SkipTest(
            "Not fully tested/succeeded since some extractors failed"
            " to load:\n%s" % ("\n".join(skipped_extractors)))
def test_file_extraction_result(ds_path):
    """Verify shape and content of a single file-level extraction record."""
    ds = Dataset(ds_path).create(force=True)
    ds.config.add('datalad.metadata.exclude-path', '.metadata', where='dataset')
    ds.save()
    assert_repo_status(ds.path)

    file_path = "sub/one"
    extractor_name = "metalad_core_file"
    # ask the extractor implementation itself for the version it reports
    extractor_version = get_extractor_class(extractor_name)(
        None, None, None).get_version()

    res = meta_extract(
        extractorname=extractor_name, path=file_path, dataset=ds)
    # a single file record, no dataset record
    assert_result_count(res, 1)
    assert_result_count(res, 1, type='file')
    assert_result_count(res, 0, type='dataset')

    record = res[0]["metadata_record"]
    _check_metadata_record(
        metadata_record=record,
        dataset=ds,
        extractor_name=extractor_name,
        extractor_version=extractor_version,
        extraction_parameter={},
        path=file_path)

    payload = record["extracted_metadata"]
    assert_in("content_byte_size", payload)
    assert_in("@id", payload)
    eq_(payload["type"], "file")
    eq_(payload["path"], file_path)
    eq_(payload["comment"], "test-implementation of core_file")