def test_dedup():
    """Exercise `dedup_lineage`: shared sub-trees collapse to one object,
    and inconsistent duplicates raise `InvalidDocException`."""
    ds0 = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))

    # make sure ds0 has duplicate C nodes with equivalent data
    assert ds0.sources['ab'].sources['bc'].doc is not ds0.sources['ac'].doc
    assert ds0.sources['ab'].sources['bc'].doc == ds0.sources['ac'].doc

    # Deduplication should make equivalent duplicates share one object
    ds = SimpleDocNav(dedup_lineage(ds0))
    assert ds.sources['ab'].sources['bc'].doc is ds.sources['ac'].doc
    assert ds.sources['ab'].sources['bc'].sources['cd'].doc is ds.sources['ac'].sources['cd'].doc

    # again but with raw doc
    ds = SimpleDocNav(dedup_lineage(ds0.doc))
    assert ds.sources['ab'].sources['bc'].doc is ds.sources['ac'].doc
    assert ds.sources['ab'].sources['bc'].sources['cd'].doc is ds.sources['ac'].sources['cd'].doc

    # Test that we detect inconsistent metadata for duplicate entries
    ds0 = SimpleDocNav(gen_dataset_test_dag(3, force_tree=True))
    ds0.sources['ac'].doc['label'] = 'Modified'
    ds0 = SimpleDocNav(ds0.doc)
    assert ds0.sources['ab'].sources['bc'].doc != ds0.sources['ac'].doc

    with pytest.raises(InvalidDocException, match=r'Inconsistent metadata .*'):
        dedup_lineage(ds0)

    # Test that we detect inconsistent lineage subtrees for duplicate entries

    # Subtest 1: different set of keys
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)
    assert 'cd' in srcs
    srcs['cd'] = {}
    ds0 = SimpleDocNav(ds0.doc)

    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)

    # Subtest 2: different values for "child" nodes
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)
    assert 'cd' in srcs
    srcs['cd']['id'] = '7fe57724-ed44-4beb-a3ab-c275339049be'
    ds0 = SimpleDocNav(ds0.doc)

    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)

    # Subtest 3: different name for child
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)
    assert 'cd' in srcs
    srcs['CD'] = srcs['cd']
    del srcs['cd']
    ds0 = SimpleDocNav(ds0.doc)

    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)
def check_with_existing_lineage(clirunner, index):
    """
    A -> B
    |    |
    |    v
    +--> C -> D
    |
    +--> E

    Add nodes BCE(D) with auto-matching, then add node A with product
    restricted to A only.
    """
    ds = SimpleDocNav(gen_dataset_test_dag(33, force_tree=True))
    child_docs = [ds.sources[name].doc for name in ('ab', 'ac', 'ae')]

    prefix = write_files({
        'lineage.yml': yaml.safe_dump_all(child_docs),
        'main.yml': yaml.safe_dump(ds.doc),
    })

    # Index the lineage datasets first; all three should now resolve.
    clirunner(['dataset', 'add', str(prefix / 'lineage.yml')])
    for name in ('ae', 'ab', 'ac'):
        assert index.datasets.get(ds.sources[name].id) is not None

    # Top-level dataset goes in without auto-adding lineage, product pinned to A.
    clirunner(['dataset', 'add',
               '--no-auto-add-lineage',
               '--product', 'A',
               str(prefix / 'main.yml')])

    assert index.datasets.get(ds.id) is not None
def check_missing_lineage(clirunner, index):
    """
    A -> B
    |    |
    |    v
    +--> C -> D
    |
    +--> E

    Use --no-auto-add-lineage
    """
    ds = SimpleDocNav(gen_dataset_test_dag(44, force_tree=True))
    child_docs = [ds.sources[x].doc for x in ('ae', 'ab', 'ac')]

    prefix = write_files({
        'lineage.yml': yaml.safe_dump_all(child_docs),
        'main.yml': yaml.safe_dump(ds.doc),
    })

    # Without the lineage datasets indexed, adding the parent must fail.
    r = clirunner(['dataset', 'add', '--no-auto-add-lineage',
                   str(prefix / 'main.yml')])
    assert 'ERROR Following lineage datasets are missing' in r.output
    assert index.datasets.has(ds.id) is False

    # now add lineage and try again
    # (fix: result objects were bound to `r` but never inspected — dropped)
    clirunner(['dataset', 'add', str(prefix / 'lineage.yml')])
    assert index.datasets.has(ds.sources['ae'].id)

    clirunner(['dataset', 'add', '--no-auto-add-lineage',
               str(prefix / 'main.yml')])
    assert index.datasets.has(ds.id)
def check_no_product_match(clirunner, index):
    """Verify error reporting when a dataset matches no product (or fails
    the signature of the product it was pinned to); nothing gets indexed."""
    ds = SimpleDocNav(gen_dataset_test_dag(22, force_tree=True))

    prefix = write_files({'agdc-metadata.yml': yaml.safe_dump(ds.doc)})

    # Pinned to product A: metadata does not fit A's signature.
    r = clirunner(['dataset', 'add',
                   '--product', 'A',
                   str(prefix)])
    assert 'ERROR Dataset metadata did not match product signature' in r.output

    # Restricted to A or B: no product matches at all.
    r = clirunner(['dataset', 'add',
                   '--product', 'A',
                   '--product', 'B',
                   str(prefix)])
    assert 'ERROR No matching Product found for dataset' in r.output

    ds_ = index.datasets.get(ds.id, include_sources=True)
    assert ds_ is None

    # Ignore lineage but fail to match main dataset
    r = clirunner(['dataset', 'add',
                   '--product', 'B',
                   '--confirm-ignore-lineage',
                   str(prefix)])

    assert 'ERROR Dataset metadata did not match product signature' in r.output
    assert index.datasets.has(ds.id) is False
def test_dataset_add(dataset_add_configs, index_empty, clirunner):
    # NOTE(review): a second `def test_dataset_add` appears later in this file
    # and shadows this one, so pytest never collects this copy — confirm which
    # version is intended and remove/rename the other.
    p = dataset_add_configs
    index = index_empty

    # Adding datasets before any products exist must fail.
    r = clirunner(['dataset', 'add', p.datasets], expect_success=False)
    assert r.exit_code != 0
    assert 'Found no products' in r.output

    clirunner(['metadata_type', 'add', p.metadata])
    clirunner(['product', 'add', p.products])
    clirunner(['dataset', 'add', p.datasets])
    clirunner(['dataset', 'add', p.datasets_bad1])

    ds = load_dataset_definition(p.datasets)
    ds_bad1 = load_dataset_definition(p.datasets_bad1)

    # Good dataset (and its lineage) is searchable, bad one is not.
    r = clirunner(['dataset', 'search'], expect_success=True)
    assert ds.id in r.output
    assert ds_bad1.id not in r.output
    assert ds.sources['ab'].id in r.output
    assert ds.sources['ac'].sources['cd'].id in r.output

    r = clirunner(['dataset', 'info', '-f', 'csv', ds.id])
    assert ds.id in r.output

    r = clirunner(['dataset', 'info', '-f', 'yaml', '--show-sources', ds.id])
    assert ds.sources['ae'].id in r.output

    r = clirunner(['dataset', 'info', '-f', 'yaml', '--show-derived',
                   ds.sources['ae'].id])
    assert ds.id in r.output

    ds_ = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))
    assert ds_.id == ds.id

    # Sources round-trip through the index.
    x = index.datasets.get(ds.id, include_sources=True)
    assert str(x.sources['ab'].id) == ds.sources['ab'].id
    assert str(x.sources['ac'].sources['cd'].id) == ds.sources['ac'].sources['cd'].id

    check_skip_lineage_test(clirunner, index)
    check_no_product_match(clirunner, index)
    check_with_existing_lineage(clirunner, index)
    check_inconsistent_lineage(clirunner, index)
    check_missing_metadata_doc(clirunner)
    check_missing_lineage(clirunner, index)
    check_no_confirm(clirunner, p.datasets)
    check_bad_yaml(clirunner, index)

    # check --product=nosuchproduct
    r = clirunner(['dataset', 'add', '--product', 'nosuchproduct', p.datasets],
                  expect_success=False)
    assert "ERROR Supplied product name" in r.output
    assert r.exit_code != 0

    # Check that deprecated option is accepted
    r = clirunner(['dataset', 'add', '--auto-match', p.datasets])
    assert 'WARNING --auto-match option is deprecated' in r.output
def test_remap_lineage_doc():
    """`remap_lineage_doc` accepts either a SimpleDocNav or a raw doc and
    applies the node transform to the root and every source."""
    def mk_node(ds, sources):
        # Flatten a node into {'id': ..., <source-name>: <transformed child>}
        return dict(id=ds.id, **sources)

    ds = SimpleDocNav(gen_dataset_test_dag(3, force_tree=True))

    # SimpleDocNav input
    xx = remap_lineage_doc(ds, mk_node)
    assert xx['id'] == ds.id
    assert xx['ac']['id'] == ds.sources['ac'].id

    # raw document input
    xx = remap_lineage_doc(ds.doc, mk_node)
    assert xx['id'] == ds.id
    assert xx['ac']['id'] == ds.sources['ac'].id
def check_skip_lineage_test(clirunner, index):
    """With --confirm-ignore-lineage only the top dataset is indexed:
    it ends up with no sources and none of its lineage datasets exist."""
    ds = SimpleDocNav(gen_dataset_test_dag(11, force_tree=True))

    prefix = write_files({'agdc-metadata.yml': yaml.safe_dump(ds.doc)})

    clirunner(['dataset', 'add',
               '--confirm-ignore-lineage',
               '--product', 'A',
               str(prefix)])

    ds_ = index.datasets.get(ds.id, include_sources=True)
    assert ds_ is not None
    assert str(ds_.id) == ds.id
    assert ds_.sources == {}

    # None of the lineage datasets were added.
    for missing in (ds.sources['ab'],
                    ds.sources['ac'],
                    ds.sources['ae'],
                    ds.sources['ac'].sources['cd']):
        assert index.datasets.get(missing.id) is None
def check_inconsistent_lineage(clirunner, index):
    """
    A -> B
    |    |
    |    v
    +--> C -> D
    |
    +--> E

    Add node E, then try adding A with modified E in the lineage, should
    fail to add ABCD
    """
    ds = SimpleDocNav(gen_dataset_test_dag(1313, force_tree=True))
    child_docs = [ds.sources[x].doc for x in ('ae',)]

    # Same tree, but E's label diverges from the already-indexed copy.
    modified_doc = toolz.assoc_in(
        ds.doc,
        'lineage.source_datasets.ae.label'.split('.'),
        'modified')

    prefix = write_files({
        'lineage.yml': yaml.safe_dump_all(child_docs),
        'main.yml': yaml.safe_dump(modified_doc),
    })

    clirunner(['dataset', 'add', str(prefix / 'lineage.yml')])
    assert index.datasets.get(ds.sources['ae'].id) is not None

    # Verification rejects the whole tree.
    r = clirunner(['dataset', 'add', str(prefix / 'main.yml')])
    assert 'ERROR Inconsistent lineage dataset' in r.output

    for node in (ds,
                 ds.sources['ab'],
                 ds.sources['ac'],
                 ds.sources['ac'].sources['cd']):
        assert index.datasets.has(node.id) is False

    # now again but skipping verification check
    r = clirunner(['dataset', 'add', '--no-verify-lineage',
                   str(prefix / 'main.yml')])

    for node in (ds,
                 ds.sources['ab'],
                 ds.sources['ac'],
                 ds.sources['ac'].sources['cd']):
        assert index.datasets.has(node.id)
def test_dataset_add(dataset_add_configs, index_empty, clirunner):
    # NOTE(review): this redefines `test_dataset_add` from earlier in the file,
    # shadowing that copy under pytest collection. Also note this version uses
    # the `metadata add` subcommand where the earlier one uses `metadata_type
    # add` — confirm which pair is current and drop the stale definition.
    p = dataset_add_configs
    index = index_empty

    # No products yet: add must fail.
    r = clirunner(['dataset', 'add', p.datasets], expect_success=False)
    assert r.exit_code != 0
    assert 'Found no products' in r.output

    clirunner(['metadata', 'add', p.metadata])
    clirunner(['product', 'add', p.products])
    clirunner(['dataset', 'add', p.datasets])
    clirunner(['dataset', 'add', p.datasets_bad1])
    clirunner(['dataset', 'add', p.datasets_eo3])

    ds = load_dataset_definition(p.datasets)
    ds_bad1 = load_dataset_definition(p.datasets_bad1)

    # Check .hl.Doc2Dataset
    doc2ds = Doc2Dataset(index)
    _ds, _err = doc2ds(ds.doc, 'file:///something')
    assert _err is None
    assert str(_ds.id) == ds.id
    assert _ds.metadata_doc == ds.doc

    # Check dataset search
    r = clirunner(['dataset', 'search'], expect_success=True)
    assert ds.id in r.output
    assert ds_bad1.id not in r.output
    assert ds.sources['ab'].id in r.output
    assert ds.sources['ac'].sources['cd'].id in r.output

    r = clirunner(['dataset', 'info', '-f', 'csv', ds.id])
    assert ds.id in r.output

    r = clirunner(['dataset', 'info', '-f', 'yaml', '--show-sources', ds.id])
    assert ds.sources['ae'].id in r.output

    r = clirunner(['dataset', 'info', '-f', 'yaml', '--show-derived',
                   ds.sources['ae'].id])
    assert ds.id in r.output

    ds_ = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))
    assert ds_.id == ds.id

    # Lineage round-trips through the index.
    x = index.datasets.get(ds.id, include_sources=True)
    assert str(x.sources['ab'].id) == ds.sources['ab'].id
    assert str(x.sources['ac'].sources['cd'].id) == ds.sources['ac'].sources['cd'].id

    check_skip_lineage_test(clirunner, index)
    check_no_product_match(clirunner, index)
    check_with_existing_lineage(clirunner, index)
    check_inconsistent_lineage(clirunner, index)
    check_missing_metadata_doc(clirunner)
    check_missing_lineage(clirunner, index)
    check_no_confirm(clirunner, p.datasets)
    check_bad_yaml(clirunner, index)

    # check --product=nosuchproduct
    r = clirunner(['dataset', 'add', '--product', 'nosuchproduct', p.datasets],
                  expect_success=False)
    assert "ERROR Supplied product name" in r.output
    assert r.exit_code != 0

    # Check that deprecated option is accepted
    r = clirunner(['dataset', 'add', '--auto-match', p.datasets])
    assert 'WARNING --auto-match option is deprecated' in r.output

    # test dataset add eo3
    r = clirunner(['dataset', 'add', p.datasets_eo3])
    assert r.exit_code == 0

    ds_eo3 = load_dataset_definition(p.datasets_eo3)
    assert ds_eo3.location is not None

    _ds = index.datasets.get(ds_eo3.id, include_sources=True)
    assert sorted(_ds.sources) == ['a', 'bc1', 'bc2']
    assert _ds.crs == 'EPSG:3857'
    assert _ds.extent is not None
    assert _ds.extent.crs == _ds.crs
    assert _ds.uris == [ds_eo3.location]
    assert 'location' not in _ds.metadata_doc