def __call__(self, doc, uri):
    """Try to build a dataset from a metadata document and a uri.

    :param doc: Dictionary or SimpleDocNav object
    :param uri: String "location" property of the Dataset

    :return: (dataset, None) if successful,
    :return: (None, ErrorMessage) on failure
    """
    nav = doc if isinstance(doc, SimpleDocNav) else SimpleDocNav(doc)

    if self._eo3:
        # auto_skip is switched on only when the setting is the string 'auto'
        nav = SimpleDocNav(prep_eo3(nav.doc, auto_skip=(self._eo3 == 'auto')))

    dataset, err = self._ds_resolve(nav, uri)
    if dataset is None:
        return None, err

    # A resolved dataset is only handed back when it passes the
    # consistency check; otherwise the reason is returned as the error.
    is_consistent, reason = check_dataset_consistent(dataset)
    if is_consistent:
        return dataset, None
    return None, reason
def check_missing_lineage(clirunner, index):
    """
      A -> B
      |    |
      |    v
      +--> C -> D
      |
      +--> E

    Use --no-auto-add-lineage
    """
    root = SimpleDocNav(gen_dataset_test_dag(44, force_tree=True))
    lineage_docs = [root.sources[name].doc for name in ('ae', 'ab', 'ac')]

    prefix = write_files({
        'lineage.yml': yaml.safe_dump_all(lineage_docs),
        'main.yml': yaml.safe_dump(root.doc),
    })

    def add_main_without_lineage():
        # Fresh argument list for every invocation
        return clirunner(['dataset', 'add',
                          '--no-auto-add-lineage',
                          str(prefix / 'main.yml')])

    # Lineage is not in the index yet: the add must be rejected
    result = add_main_without_lineage()
    assert 'ERROR Following lineage datasets are missing' in result.output
    assert index.datasets.has(root.id) is False

    # now add lineage and try again
    clirunner(['dataset', 'add', str(prefix / 'lineage.yml')])
    assert index.datasets.has(root.sources['ae'].id)

    add_main_without_lineage()
    assert index.datasets.has(root.id)
def check_with_existing_lineage(clirunner, index):
    """
      A -> B
      |    |
      |    v
      +--> C -> D
      |
      +--> E

    Add nodes BCE(D) with auto-matching, then add node A with product
    restricted to A only.
    """
    root = SimpleDocNav(gen_dataset_test_dag(33, force_tree=True))
    lineage_names = ('ab', 'ac', 'ae')
    lineage_docs = [root.sources[name].doc for name in lineage_names]

    prefix = write_files({
        'lineage.yml': yaml.safe_dump_all(lineage_docs),
        'main.yml': yaml.safe_dump(root.doc),
    })

    # First the lineage datasets on their own ...
    clirunner(['dataset', 'add', str(prefix / 'lineage.yml')])
    for name in ('ae', 'ab', 'ac'):
        assert index.datasets.get(root.sources[name].id) is not None

    # ... then the top-level dataset, restricted to product A
    clirunner(['dataset', 'add',
               '--no-auto-add-lineage',
               '--product', 'A',
               str(prefix / 'main.yml')])

    assert index.datasets.get(root.id) is not None
def doc_path_stream(files, on_error, uri=True):
    """Lazily yield ``(path, SimpleDocNav)`` pairs for every document in ``files``.

    :param files: iterable of file names, each passed to ``read_documents``
    :param on_error: callback invoked as ``on_error(fname, exception)`` when
                     an ``InvalidDocException`` is raised while processing
                     ``fname``; iteration then continues with the next file
    :param uri: forwarded to ``read_documents`` as the ``uri`` keyword
    """
    for source_file in files:
        try:
            yield from ((path, SimpleDocNav(raw_doc))
                        for path, raw_doc in read_documents(source_file, uri=uri))
        except InvalidDocException as error:
            on_error(source_file, error)
def load_dataset_definition(path):
    """Load the first metadata document found for ``path``.

    :param path: ``pathlib.Path`` or anything ``pathlib.Path`` accepts
    :return: ``SimpleDocNav`` wrapping the first document produced by
             ``read_documents``, or ``None`` when it produces nothing
    """
    as_path = path if isinstance(path, pathlib.Path) else pathlib.Path(path)
    metadata_file = get_metadata_path(as_path)

    documents = iter(read_documents(metadata_file))
    try:
        _, first_doc = next(documents)
    except StopIteration:
        # No documents at all
        return None
    return SimpleDocNav(first_doc)
def check_no_product_match(clirunner, index):
    """Check that ``dataset add`` reports an error when products do not match."""
    root = SimpleDocNav(gen_dataset_test_dag(22, force_tree=True))
    prefix = write_files({'agdc-metadata.yml': yaml.safe_dump(root.doc)})
    location = str(prefix)

    # Single product supplied
    result = clirunner(['dataset', 'add', '--product', 'A', location])
    assert 'ERROR Dataset metadata did not match product signature' in result.output

    # Two products supplied
    result = clirunner(['dataset', 'add',
                        '--product', 'A',
                        '--product', 'B',
                        location])
    assert 'ERROR No matching Product found for dataset' in result.output

    stored = index.datasets.get(root.id, include_sources=True)
    assert stored is None

    # Ignore lineage but fail to match main dataset
    result = clirunner(['dataset', 'add',
                        '--product', 'B',
                        '--confirm-ignore-lineage',
                        location])
    assert 'ERROR Dataset metadata did not match product signature' in result.output
    assert index.datasets.has(root.id) is False
def test_dataset_add(dataset_add_configs, index_empty, clirunner):
    """End-to-end check of ``dataset add`` together with ``search`` and ``info``."""
    cfg = dataset_add_configs
    index = index_empty

    # No products have been added yet, so this add is expected to fail
    res = clirunner(['dataset', 'add', cfg.datasets], expect_success=False)
    assert res.exit_code != 0
    assert 'Found no products' in res.output

    for setup_cmd in (['metadata_type', 'add', cfg.metadata],
                      ['product', 'add', cfg.products],
                      ['dataset', 'add', cfg.datasets],
                      ['dataset', 'add', cfg.datasets_bad1]):
        clirunner(setup_cmd)

    ds = load_dataset_definition(cfg.datasets)
    ds_bad1 = load_dataset_definition(cfg.datasets_bad1)

    ab_id = ds.sources['ab'].id
    ae_id = ds.sources['ae'].id
    cd_id = ds.sources['ac'].sources['cd'].id

    res = clirunner(['dataset', 'search'], expect_success=True)
    assert ds.id in res.output
    assert ds_bad1.id not in res.output
    assert ab_id in res.output
    assert cd_id in res.output

    res = clirunner(['dataset', 'info', '-f', 'csv', ds.id])
    assert ds.id in res.output

    res = clirunner(['dataset', 'info', '-f', 'yaml', '--show-sources', ds.id])
    assert ae_id in res.output

    res = clirunner(['dataset', 'info', '-f', 'yaml', '--show-derived', ae_id])
    assert ds.id in res.output

    regenerated = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))
    assert regenerated.id == ds.id

    stored = index.datasets.get(ds.id, include_sources=True)
    assert str(stored.sources['ab'].id) == ab_id
    assert str(stored.sources['ac'].sources['cd'].id) == cd_id

    # Helper scenarios; the call order is kept exactly as it was
    for scenario in (check_skip_lineage_test,
                     check_no_product_match,
                     check_with_existing_lineage,
                     check_inconsistent_lineage):
        scenario(clirunner, index)
    check_missing_metadata_doc(clirunner)
    check_missing_lineage(clirunner, index)
    check_no_confirm(clirunner, cfg.datasets)
    check_bad_yaml(clirunner, index)

    # check --product=nosuchproduct
    res = clirunner(['dataset', 'add', '--product', 'nosuchproduct', cfg.datasets],
                    expect_success=False)
    assert "ERROR Supplied product name" in res.output
    assert res.exit_code != 0

    # Check that deprecated option is accepted
    res = clirunner(['dataset', 'add', '--auto-match', cfg.datasets])
    assert 'WARNING --auto-match option is deprecated' in res.output
def resolve(main_ds, uri):
    """Resolve a metadata document and its lineage into a ``Dataset`` tree.

    Relies on ``index``, ``fail_on_missing_lineage``, ``verify_lineage`` and
    ``match_product`` being available from the surrounding scope (they are
    not defined inside this block).

    :param main_ds: document accepted by ``dedup_lineage``
    :param uri: uri attached to the top-level dataset only
    :return: ``(dataset, None)`` on success, ``(None, error)`` on failure,
             where ``error`` is a message string or the caught exception
    """
    try:
        main_ds = SimpleDocNav(dedup_lineage(main_ds))
    except InvalidDocException as e:
        return None, e

    main_uuid = main_ds.id

    # One representative per uuid out of the flattened lineage tree
    ds_by_uuid = toolz.valmap(toolz.first, flatten_datasets(main_ds))
    all_uuid = list(ds_by_uuid)
    db_dss = {str(found.id): found for found in index.datasets.bulk_get(all_uuid)}

    lineage_uuids = {candidate for candidate in all_uuid if candidate != main_uuid}
    missing_lineage = lineage_uuids - set(db_dss)

    if missing_lineage and fail_on_missing_lineage:
        return None, "Following lineage datasets are missing from DB: %s" % (','.join(missing_lineage))

    if verify_lineage and not is_doc_eo3(main_ds.doc):
        # Compare every lineage dataset already in the DB with the
        # version embedded in this document
        bad_lineage = []
        for uuid in lineage_uuids:
            if uuid not in db_dss:
                continue
            supplied = jsonify_document(ds_by_uuid[uuid].doc_without_lineage_sources)
            ok, err = check_consistent(supplied, db_dss[uuid].metadata_doc)
            if not ok:
                bad_lineage.append((uuid, err))

        if bad_lineage:
            report_lines = ['Inconsistent lineage dataset {}:\n> {}'.format(uuid, err)
                            for uuid, err in bad_lineage]
            return None, '\n'.join(report_lines)

    def resolve_ds(ds, sources, cache=None):
        # ``cache`` is always supplied by the remap_lineage_doc call below
        cached = cache.get(ds.id)
        if cached is not None:
            return cached

        # Only the top-level dataset carries the uri
        uris = [uri] if ds.id == main_uuid else []
        doc = ds.doc

        # Reuse the product of a dataset already in the DB, otherwise
        # match one from the document
        db_ds = db_dss.get(ds.id)
        product = db_ds.type if db_ds else match_product(doc)

        resolved = Dataset(product, doc, uris=uris, sources=sources)
        cache[ds.id] = resolved
        return resolved

    try:
        return remap_lineage_doc(main_ds, resolve_ds, cache={}), None
    except BadMatch as e:
        return None, e
def test_dataset_add_ambgious_products(dataset_add_configs, index_empty, clirunner):
    """Datasets that match two products need ``--product``/``--exclude-product``."""
    cfg = dataset_add_configs
    index = index_empty

    # Two datasets carrying both flags, so each one matches A and B
    dss = [SimpleDocNav(dataset_maker(seed)('A',
                                            product_type='eo',
                                            flag_a='a',
                                            flag_b='b'))
           for seed in [1, 2]]
    first, second = dss

    prefix = write_files({
        'products.yml': '''
name: A
description: test product A
metadata_type: minimal
metadata:
    product_type: eo
    flag_a: a

---
name: B
description: test product B
metadata_type: minimal
metadata:
    product_type: eo
    flag_b: b
''',
        'dataset1.yml': yaml.safe_dump(first.doc),
        'dataset2.yml': yaml.safe_dump(second.doc),
    })

    clirunner(['metadata', 'add', cfg.metadata])
    clirunner(['product', 'add', str(prefix / 'products.yml')])

    products = list(index.products.get_all())
    assert len(products) == 2

    # Without a hint the auto match must fail for both datasets
    for number, ds in enumerate(dss, start=1):
        res = clirunner(['dataset', 'add', str(prefix / ('dataset%d.yml' % number))])
        assert 'ERROR Auto match failed' in res.output
        assert 'matches several products' in res.output
        assert index.datasets.has(ds.id) is False

    # check that forcing product works
    clirunner(['dataset', 'add', '--product', 'A', str(prefix / 'dataset1.yml')])
    assert index.datasets.has(first.id) is True

    # check that forcing via exclude works
    clirunner(['dataset', 'add', '--exclude-product', 'B', str(prefix / 'dataset2.yml')])
    assert index.datasets.has(second.id) is True
def dedup_lineage(root):
    """Replace repeated nodes in a lineage tree with references to one object.

    Raises `InvalidDocException` when a repeated dataset (same uuid, but
    different path from root) has either conflicting metadata or
    conflicting lineage data.

    :param dict|SimpleDocNav root: document to de-duplicate

    Returns a new document that has the same structure as the input
    document, but with duplicate entries now being aliases rather than
    copies.
    """

    def check_sources(a, b):
        """True if two dictionaries contain same objects under the same names.

        same, not just equivalent.
        """
        if len(a) != len(b):
            return False

        for (name_a, obj_a), (name_b, obj_b) in zip(sorted_items(a), sorted_items(b)):
            if name_a != name_b or obj_a is not obj_b:
                return False
        return True

    def mk_node(ds, sources, cache, sources_path):
        seen = cache.get(ds.id)
        doc = ds.doc_without_lineage_sources

        if seen is None:
            # First encounter of this id: build the node and remember it
            rebuilt = toolz.assoc_in(doc, sources_path, sources)
            cache[ds.id] = (rebuilt, doc, sources)
            return rebuilt

        prev_ds, prev_doc, prev_sources = seen

        if not check_sources(sources, prev_sources):
            raise InvalidDocException(
                'Inconsistent lineage for repeated dataset with _id: {}'.format(ds.id))

        if doc != prev_doc:
            raise InvalidDocException(
                'Inconsistent metadata for repeated dataset with _id: {}'.format(ds.id))

        # Same id, same content: hand back the object built earlier
        return prev_ds

    nav = root if isinstance(root, SimpleDocNav) else SimpleDocNav(root)
    return remap_lineage_doc(nav, mk_node, cache={}, sources_path=nav.sources_path)
def remap_lineage_doc(root, mk_node, **kwargs):
    """Rebuild a lineage tree bottom-up through ``mk_node``.

    :param root: ``SimpleDocNav`` or a raw document that gets wrapped in one
    :param mk_node: called as ``mk_node(ds, sources, **kwargs)`` where
                    ``sources`` maps each source name to the already
                    rebuilt child node
    :param kwargs: passed unchanged to every ``mk_node`` call
    :return: whatever ``mk_node`` returns for the root node
    """

    def rebuild(node):
        # Children first, in sorted_items order, then the node itself
        children = {}
        for name, child in sorted_items(node.sources):
            children[name] = rebuild(child)
        return mk_node(node, children, **kwargs)

    nav = root if isinstance(root, SimpleDocNav) else SimpleDocNav(root)
    return rebuild(nav)
def test_dataset_maker():
    """Check basic properties of documents produced by ``dataset_maker``."""
    make = dataset_maker(0)

    # Same maker, same name -> equal documents
    assert make('aa') == make('aa')

    nav_a = SimpleDocNav(make('A'))
    nav_b = SimpleDocNav(make('B'))

    assert nav_a.id != nav_b.id
    assert nav_a.doc['creation_dt'] == nav_b.doc['creation_dt']
    assert isinstance(nav_a.id, str)
    assert nav_a.sources == {}

    # Same name from makers with different arguments
    doc_0, doc_1 = [dataset_maker(seed)('A', product_type='eo') for seed in (0, 1)]
    assert doc_0['id'] != doc_1['id']
    assert doc_0['creation_dt'] != doc_1['creation_dt']
    assert doc_0['product_type'] == 'eo'

    # Source documents are referenced, not copied
    nav_c = SimpleDocNav(make('C', sources=dict(a=nav_a.doc, b=nav_b.doc)))
    assert nav_c.sources['a'].doc is nav_a.doc
    assert nav_c.sources['b'].doc is nav_b.doc
def test_remap_lineage_doc():
    """``remap_lineage_doc`` accepts both a ``SimpleDocNav`` and a raw document."""

    def mk_node(ds, sources):
        return dict(id=ds.id, **sources)

    nav = SimpleDocNav(gen_dataset_test_dag(3, force_tree=True))

    # First with the navigator object, then with the raw document
    for tree_input in (nav, nav.doc):
        remapped = remap_lineage_doc(tree_input, mk_node)
        assert remapped['id'] == nav.id
        assert remapped['ac']['id'] == nav.sources['ac'].id
def mk_dataset(ds, uri):
    """Build a replacement ``Dataset`` for a dataset already in the index.

    ``index`` is not defined in this block; presumably it comes from the
    enclosing scope.

    :param ds: document navigator exposing ``id``, ``doc``
    :param uri: uri recorded on the new ``Dataset``
    :return: ``(dataset, existing, None)`` on success, or
             ``(None, None, error_message)`` when the document has no id or
             the id is not found in the index
    """
    uuid = ds.id

    if uuid is None:
        return None, None, "Metadata document is missing id field"

    existing = index.datasets.get(uuid)
    if existing is None:
        return None, None, "No such dataset in the database: {}".format(uuid)

    ds = SimpleDocNav(prep_eo3(ds.doc, auto_skip=True))

    # TODO: what about sources=?
    # The product is taken from the dataset already stored in the index
    return Dataset(existing.type,
                   ds.doc_without_lineage_sources,
                   uris=[uri]), existing, None
def check_skip_lineage_test(clirunner, index):
    """With ``--confirm-ignore-lineage`` only the top-level dataset is indexed."""
    root = SimpleDocNav(gen_dataset_test_dag(11, force_tree=True))

    prefix = write_files({'agdc-metadata.yml': yaml.safe_dump(root.doc)})

    clirunner(['dataset', 'add',
               '--confirm-ignore-lineage',
               '--product', 'A',
               str(prefix)])

    stored = index.datasets.get(root.id, include_sources=True)
    assert stored is not None
    assert str(stored.id) == root.id
    assert stored.sources == {}

    # None of the lineage datasets were added
    skipped = (root.sources['ab'],
               root.sources['ac'],
               root.sources['ae'],
               root.sources['ac'].sources['cd'])
    for source in skipped:
        assert index.datasets.get(source.id) is None
def test_dataset_add_ambgious_products(dataset_add_configs, index_empty, clirunner):
    """A dataset that matches two products needs an explicit ``--product``."""
    cfg = dataset_add_configs
    index = index_empty

    make = dataset_maker(0)
    # Carries both flags, so it matches product A as well as product B
    ds = SimpleDocNav(make('A',
                           product_type='eo',
                           flag_a='a',
                           flag_b='b'))

    prefix = write_files({
        'products.yml': '''
name: A
description: test product A
metadata_type: minimal
metadata:
    product_type: eo
    flag_a: a

---
name: B
description: test product B
metadata_type: minimal
metadata:
    product_type: eo
    flag_b: b
''',
        'dataset.yml': yaml.safe_dump(ds.doc),
    })

    clirunner(['metadata_type', 'add', cfg.metadata])
    clirunner(['product', 'add', str(prefix / 'products.yml')])

    products = list(index.products.get_all())
    assert len(products) == 2

    res = clirunner(['dataset', 'add', str(prefix / 'dataset.yml')])
    assert 'ERROR Auto match failed' in res.output
    assert 'matches several products' in res.output
    assert index.datasets.has(ds.id) is False

    # check that forcing product works
    clirunner(['dataset', 'add', '--product', 'A', str(prefix / 'dataset.yml')])
    assert index.datasets.has(ds.id) is True
def check_inconsistent_lineage(clirunner, index):
    """
      A -> B
      |    |
      |    v
      +--> C -> D
      |
      +--> E

    Add node E, then try adding A with modified E in the lineage, should fail
    to add ABCD
    """
    root = SimpleDocNav(gen_dataset_test_dag(1313, force_tree=True))

    lineage_docs = [root.sources[name].doc for name in ('ae', )]
    label_path = 'lineage.source_datasets.ae.label'.split('.')
    modified_doc = toolz.assoc_in(root.doc, label_path, 'modified')

    prefix = write_files({
        'lineage.yml': yaml.safe_dump_all(lineage_docs),
        'main.yml': yaml.safe_dump(modified_doc),
    })
    main_yml = str(prefix / 'main.yml')

    clirunner(['dataset', 'add', str(prefix / 'lineage.yml')])
    assert index.datasets.get(root.sources['ae'].id) is not None

    # A, B, C and D, in the order the assertions are made
    abcd_ids = [root.id,
                root.sources['ab'].id,
                root.sources['ac'].id,
                root.sources['ac'].sources['cd'].id]

    result = clirunner(['dataset', 'add', main_yml])
    assert 'ERROR Inconsistent lineage dataset' in result.output
    for uuid in abcd_ids:
        assert index.datasets.has(uuid) is False

    # now again but skipping verification check
    clirunner(['dataset', 'add', '--no-verify-lineage', main_yml])
    for uuid in abcd_ids:
        assert index.datasets.has(uuid)
def test_dataset_add_with_nans(dataset_add_configs, index_empty, clirunner):
    """NaN and infinite metadata values come back from the index as strings."""
    cfg = dataset_add_configs
    index = index_empty

    clirunner(['metadata', 'add', cfg.metadata])
    clirunner(['product', 'add', cfg.products])

    make = dataset_maker(0)

    # C is the shared source of both A and B
    doc_c = make('C', product_type='C',
                 val_is_nan=math.nan,
                 val_is_inf=math.inf,
                 val_is_neginf=-math.inf)
    doc_b = make('B', sources={'bc': doc_c}, product_type='B')
    doc_a = make('A', sources={'ac': doc_c}, product_type='A')

    prefix = write_files({
        'dataset.yml': yaml.safe_dump_all([doc_a, doc_b]),
    })

    res = clirunner(['dataset', 'add',
                     '--auto-add-lineage',
                     '--verify-lineage',
                     str(prefix / 'dataset.yml')])
    assert "ERROR" not in res.output

    nav_a, nav_b, nav_c = [SimpleDocNav(raw) for raw in (doc_a, doc_b, doc_c)]
    assert index.datasets.bulk_has([nav_a.id, nav_b.id, nav_c.id]) == [True, True, True]

    stored_c = index.datasets.get(nav_c.id).metadata_doc
    expected = (('val_is_nan', 'NaN'),
                ('val_is_inf', 'Infinity'),
                ('val_is_neginf', '-Infinity'))
    for key, text in expected:
        assert stored_c[key] == text
def doc(loc=None):
    """Return a ``SimpleDocNav`` over a two-field document with a fixed id.

    :param loc: value stored under the ``location`` key
    """
    raw = {
        'location': loc,
        'id': '4d9fd75c-1309-4712-93b5-f0d9c6fdd8ab',
    }
    return SimpleDocNav(raw)
def test_dataset_add_inconsistent_measurements(dataset_add_configs, index_empty, clirunner):
    """Only datasets that list every product measurement show up in search."""
    cfg = dataset_add_configs
    index = index_empty
    make = dataset_maker(0)

    # not set, empty, subset, full set, super-set
    variants = (
        ('A', {}),
        ('B', {'measurements': {}}),
        ('C', {'measurements': {'red': {}}}),
        ('D', {'measurements': {'red': {}, 'green': {}}}),
        ('E', {'measurements': {'red': {}, 'green': {}, 'extra': {}}}),
    )
    dss = tuple(SimpleDocNav(make(label, product_type='eo', **extra))
                for label, extra in variants)
    ds1, ds2, ds3, ds4, ds5 = dss
    docs = [nav.doc for nav in dss]

    prefix = write_files({
        'products.yml': '''
name: eo
description: test product
metadata_type: with_measurements
metadata:
    product_type: eo

measurements:
    - name: red
      dtype: int16
      nodata: -999
      units: '1'

    - name: green
      dtype: int16
      nodata: -999
      units: '1'
''',
        'dataset.yml': yaml.safe_dump_all(docs),
    })

    clirunner(['metadata', 'add', cfg.metadata])
    clirunner(['product', 'add', str(prefix / 'products.yml')])

    products = list(index.products.get_all())
    assert len(products) == 1

    res = clirunner(['dataset', 'add', str(prefix / 'dataset.yml')])
    print(res.output)

    res = clirunner(['dataset', 'search', '-f', 'csv'])
    for rejected in (ds1, ds2, ds3):
        assert rejected.id not in res.output
    for accepted in (ds4, ds5):
        assert accepted.id in res.output
def test_dataset_add(dataset_add_configs, index_empty, clirunner):
    """End-to-end check of ``dataset add``, ``Doc2Dataset``, search, info and eo3."""
    cfg = dataset_add_configs
    index = index_empty

    # No products have been added yet, so this add is expected to fail
    res = clirunner(['dataset', 'add', cfg.datasets], expect_success=False)
    assert res.exit_code != 0
    assert 'Found no products' in res.output

    for setup_cmd in (['metadata', 'add', cfg.metadata],
                      ['product', 'add', cfg.products],
                      ['dataset', 'add', cfg.datasets],
                      ['dataset', 'add', cfg.datasets_bad1],
                      ['dataset', 'add', cfg.datasets_eo3]):
        clirunner(setup_cmd)

    ds = load_dataset_definition(cfg.datasets)
    ds_bad1 = load_dataset_definition(cfg.datasets_bad1)

    ab_id = ds.sources['ab'].id
    ae_id = ds.sources['ae'].id
    cd_id = ds.sources['ac'].sources['cd'].id

    # Check .hl.Doc2Dataset
    doc2ds = Doc2Dataset(index)
    resolved, resolve_err = doc2ds(ds.doc, 'file:///something')
    assert resolve_err is None
    assert str(resolved.id) == ds.id
    assert resolved.metadata_doc == ds.doc

    # Check dataset search
    res = clirunner(['dataset', 'search'], expect_success=True)
    assert ds.id in res.output
    assert ds_bad1.id not in res.output
    assert ab_id in res.output
    assert cd_id in res.output

    res = clirunner(['dataset', 'info', '-f', 'csv', ds.id])
    assert ds.id in res.output

    res = clirunner(['dataset', 'info', '-f', 'yaml', '--show-sources', ds.id])
    assert ae_id in res.output

    res = clirunner(['dataset', 'info', '-f', 'yaml', '--show-derived', ae_id])
    assert ds.id in res.output

    regenerated = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))
    assert regenerated.id == ds.id

    stored = index.datasets.get(ds.id, include_sources=True)
    assert str(stored.sources['ab'].id) == ab_id
    assert str(stored.sources['ac'].sources['cd'].id) == cd_id

    # Helper scenarios; the call order is kept exactly as it was
    for scenario in (check_skip_lineage_test,
                     check_no_product_match,
                     check_with_existing_lineage,
                     check_inconsistent_lineage):
        scenario(clirunner, index)
    check_missing_metadata_doc(clirunner)
    check_missing_lineage(clirunner, index)
    check_no_confirm(clirunner, cfg.datasets)
    check_bad_yaml(clirunner, index)

    # check --product=nosuchproduct
    res = clirunner(['dataset', 'add', '--product', 'nosuchproduct', cfg.datasets],
                    expect_success=False)
    assert "ERROR Supplied product name" in res.output
    assert res.exit_code != 0

    # Check that deprecated option is accepted
    res = clirunner(['dataset', 'add', '--auto-match', cfg.datasets])
    assert 'WARNING --auto-match option is deprecated' in res.output

    # test dataset add eo3
    res = clirunner(['dataset', 'add', cfg.datasets_eo3])
    assert res.exit_code == 0

    ds_eo3 = load_dataset_definition(cfg.datasets_eo3)
    assert ds_eo3.location is not None

    stored_eo3 = index.datasets.get(ds_eo3.id, include_sources=True)
    assert sorted(stored_eo3.sources) == ['a', 'bc1', 'bc2']
    assert stored_eo3.crs == 'EPSG:3857'
    assert stored_eo3.extent is not None
    assert stored_eo3.extent.crs == stored_eo3.crs
    assert stored_eo3.uris == [ds_eo3.location]
    assert 'location' not in stored_eo3.metadata_doc
def test_simple_doc_nav():
    """
      A -> B
      |    |
      |    v
      +--> C -> D
      |
      +--> E
    """

    def node(name, **kwargs):
        return dict(id=name, lineage=dict(source_datasets=kwargs))

    A, _, C, _, _ = make_graph_abcde(node)
    rdr = SimpleDocNav(A)

    assert rdr.doc == A
    assert rdr.doc_without_lineage_sources == node('A')
    assert isinstance(rdr.sources['ae'], SimpleDocNav)
    assert rdr.sources['ab'].sources['bc'].doc == C

    # Repeated property access returns the very same object
    assert rdr.doc_without_lineage_sources is rdr.doc_without_lineage_sources
    assert rdr.sources is rdr.sources
    assert isinstance(rdr.sources_path, tuple)

    def visitor(node, name=None, depth=0, out=None):
        label = name if name else '..'
        out.append('{}:{}:{:d}'.format(node.id, label, depth))

    expect_preorder = '''
A:..:0
B:ab:1
C:bc:2
D:cd:3
C:ac:1
D:cd:2
E:ae:1
'''.strip()

    expect_postorder = '''
D:cd:3
C:bc:2
B:ab:1
D:cd:2
C:ac:1
E:ae:1
A:..:0
'''.strip()

    expectations = (('pre-order', expect_preorder),
                    ('post-order', expect_postorder))
    for mode, expect in expectations:
        visited = []
        traverse_datasets(rdr, visitor, mode=mode, out=visited)
        assert '\n'.join(visited) == expect

    def check_flat_view(flat):
        # C is reachable along two paths, A and E along one each
        assert len(flat['A']) == 1
        assert len(flat['C']) == 2
        assert len(flat['E']) == 1
        assert set(flat.keys()) == set('ABCDE')

    check_flat_view(flatten_datasets(rdr))

    fv, dg = flatten_datasets(rdr, with_depth_grouping=True)
    check_flat_view(fv)

    assert isinstance(dg, list)
    assert len(dg) == 4
    assert [len(level) for level in dg] == [1, 3, 2, 1]

    def to_set(xx):
        return set(x.id for x in xx)

    assert [set(s) for s in ('A', 'BCE', 'CD', 'D')] == [to_set(xx) for xx in dg]
def test_dedup():
    """``dedup_lineage`` aliases equal duplicates and rejects conflicting ones."""
    ds0 = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))

    # make sure ds0 has duplicate C nodes with equivalent data
    c_via_b = ds0.sources['ab'].sources['bc'].doc
    c_direct = ds0.sources['ac'].doc
    assert c_via_b is not c_direct
    assert c_via_b == c_direct

    # once with the navigator object, again but with raw doc
    for tree_input in (ds0, ds0.doc):
        ds = SimpleDocNav(dedup_lineage(tree_input))
        assert ds.sources['ab'].sources['bc'].doc is ds.sources['ac'].doc
        assert (ds.sources['ab'].sources['bc'].sources['cd'].doc
                is ds.sources['ac'].sources['cd'].doc)

    # Test that we detect inconsistent metadata for duplicate entries
    ds0 = SimpleDocNav(gen_dataset_test_dag(3, force_tree=True))
    ds0.sources['ac'].doc['label'] = 'Modified'
    ds0 = SimpleDocNav(ds0.doc)
    assert ds0.sources['ab'].sources['bc'].doc != ds0.sources['ac'].doc

    with pytest.raises(InvalidDocException, match=r'Inconsistent metadata .*'):
        dedup_lineage(ds0)

    # Test that we detect inconsistent lineage subtrees for duplicate entries
    def expect_lineage_conflict(mutate):
        # Fresh tree, edit the sources of the direct C node, re-wrap, dedup
        tree = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
        srcs = toolz.get_in(tree.sources_path, tree.sources['ac'].doc)
        assert 'cd' in srcs
        mutate(srcs)
        tree = SimpleDocNav(tree.doc)

        with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
            dedup_lineage(tree)

    # Subtest 1: different set of keys
    def blank_out_child(srcs):
        srcs['cd'] = {}

    # Subtest 2: different values for "child" nodes
    def change_child_id(srcs):
        srcs['cd']['id'] = '7fe57724-ed44-4beb-a3ab-c275339049be'

    # Subtest 3: different name for child
    def rename_child(srcs):
        srcs['CD'] = srcs['cd']
        del srcs['cd']

    for mutation in (blank_out_child, change_child_id, rename_child):
        expect_lineage_conflict(mutation)