import pytest
from types import SimpleNamespace
# traverse_datasets, flatten_datasets and SimpleDocNav come from the datacube
# package under test; their import lines are elided in this excerpt.


def test_traverse_datasets():
    """
    A -> B
    |    |
    |    v
    +--> C -> D
    |
    +--> E
    """

    def node(name, **kwargs):
        return SimpleNamespace(id=name, sources=kwargs)

    A, *_ = make_graph_abcde(node)

    def visitor(node, name=None, depth=0, out=None):
        s = '{}:{}:{:d}'.format(node.id,
                                name if name else '..',
                                depth)
        out.append(s)

    with pytest.raises(ValueError):
        traverse_datasets(A, visitor, mode='not-a-real-mode')

    expect_preorder = '''
A:..:0
B:ab:1
C:bc:2
D:cd:3
C:ac:1
D:cd:2
E:ae:1
'''.lstrip().rstrip()

    expect_postorder = '''
D:cd:3
C:bc:2
B:ab:1
D:cd:2
C:ac:1
E:ae:1
A:..:0
'''.lstrip().rstrip()

    for mode, expect in zip(['pre-order', 'post-order'],
                            [expect_preorder, expect_postorder]):
        out = []
        traverse_datasets(A, visitor, mode=mode, out=out)
        assert '\n'.join(out) == expect

    fv = flatten_datasets(A)

    assert len(fv['A']) == 1
    assert len(fv['C']) == 2
    assert len(fv['E']) == 1
    assert set(fv.keys()) == set('ABCDE')

    leaf = SimpleNamespace(id='N', sources=None)
    out = []
    traverse_datasets(leaf, visitor, out=out)
    assert out == ["N:..:0"]
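# Both graph tests in this file rely on a ``make_graph_abcde`` helper that is
# not part of this excerpt. A minimal sketch, assuming only what the docstring
# diagram and the expected pre-/post-order outputs pin down (edge names 'ab',
# 'ac', 'ae', 'bc', 'cd', with A's sources ordered ab, ac, ae):
def make_graph_abcde(node):
    """Build the A/B/C/D/E graph pictured above from a node constructor."""
    d = node('D')
    e = node('E')
    c = node('C', cd=d)
    b = node('B', bc=c)
    a = node('A', ab=b, ac=c, ae=e)
    return a, b, c, d, e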
def resolve(main_ds, uri):
    try:
        main_ds = SimpleDocNav(dedup_lineage(main_ds))
    except InvalidDocException as e:
        return None, e

    main_uuid = main_ds.id

    # Flatten the lineage tree to {uuid: [SimpleDocNav, ...]}, keep one
    # representative per uuid, then look them all up in the DB in one call
    ds_by_uuid = toolz.valmap(toolz.first, flatten_datasets(main_ds))
    all_uuid = list(ds_by_uuid)
    db_dss = {str(ds.id): ds for ds in index.datasets.bulk_get(all_uuid)}

    lineage_uuids = set(filter(lambda x: x != main_uuid, all_uuid))
    missing_lineage = lineage_uuids - set(db_dss)

    if missing_lineage and fail_on_missing_lineage:
        return None, "Following lineage datasets are missing from DB: %s" % (
            ','.join(missing_lineage))

    # EO3 documents reference lineage by uuid only, so consistency
    # verification against embedded source documents applies just to the
    # legacy (non-EO3) format
    if verify_lineage and not is_doc_eo3(main_ds.doc):
        bad_lineage = []

        for uuid in lineage_uuids:
            if uuid in db_dss:
                ok, err = check_consistent(jsonify_document(ds_by_uuid[uuid].doc_without_lineage_sources),
                                           db_dss[uuid].metadata_doc)
                if not ok:
                    bad_lineage.append((uuid, err))

        if len(bad_lineage) > 0:
            error_report = '\n'.join('Inconsistent lineage dataset {}:\n> {}'.format(uuid, err)
                                     for uuid, err in bad_lineage)
            return None, error_report

    def with_cache(v, k, cache):
        cache[k] = v
        return v

    def resolve_ds(ds, sources, cache=None):
        # Each distinct uuid is resolved to a Dataset exactly once
        cached = cache.get(ds.id)
        if cached is not None:
            return cached

        # Only the top-level dataset carries the uri it was loaded from
        uris = [uri] if ds.id == main_uuid else []

        doc = ds.doc

        # Prefer the product recorded in the DB; otherwise match the
        # document against the known products
        db_ds = db_dss.get(ds.id)
        if db_ds:
            product = db_ds.type
        else:
            product = match_product(doc)

        return with_cache(Dataset(product, doc, uris=uris, sources=sources), ds.id, cache)

    try:
        return remap_lineage_doc(main_ds, resolve_ds, cache={}), None
    except BadMatch as e:
        return None, e
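# Usage sketch (hedged): in datacube, ``resolve`` is the closure handed back
# by the Doc2Dataset constructor. ``doc`` and the uri below are placeholders
# for a parsed metadata document and its location; ``index`` is assumed to be
# a connected index.
#
#   resolver = Doc2Dataset(index)
#   dataset, err = resolver(doc, 'file:///path/to/metadata.yaml')
#   if dataset is None:
#       raise ValueError(str(err))
#   index.datasets.add(dataset)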
def test_simple_doc_nav():
    """
    A -> B
    |    |
    |    v
    +--> C -> D
    |
    +--> E
    """

    def node(name, **kwargs):
        return dict(id=name, lineage=dict(source_datasets=kwargs))

    A, _, C, _, _ = make_graph_abcde(node)
    rdr = SimpleDocNav(A)

    assert rdr.doc == A
    assert rdr.doc_without_lineage_sources == node('A')
    assert isinstance(rdr.sources['ae'], SimpleDocNav)
    assert rdr.sources['ab'].sources['bc'].doc == C
    assert rdr.doc_without_lineage_sources is rdr.doc_without_lineage_sources
    assert rdr.sources is rdr.sources
    assert isinstance(rdr.sources_path, tuple)

    def visitor(node, name=None, depth=0, out=None):
        s = '{}:{}:{:d}'.format(node.id,
                                name if name else '..',
                                depth)
        out.append(s)

    expect_preorder = '''
A:..:0
B:ab:1
C:bc:2
D:cd:3
C:ac:1
D:cd:2
E:ae:1
'''.lstrip().rstrip()

    expect_postorder = '''
D:cd:3
C:bc:2
B:ab:1
D:cd:2
C:ac:1
E:ae:1
A:..:0
'''.lstrip().rstrip()

    for mode, expect in zip(['pre-order', 'post-order'],
                            [expect_preorder, expect_postorder]):
        out = []
        traverse_datasets(rdr, visitor, mode=mode, out=out)
        assert '\n'.join(out) == expect

    fv = flatten_datasets(rdr)

    assert len(fv['A']) == 1
    assert len(fv['C']) == 2
    assert len(fv['E']) == 1
    assert set(fv.keys()) == set('ABCDE')

    fv, dg = flatten_datasets(rdr, with_depth_grouping=True)

    assert len(fv['A']) == 1
    assert len(fv['C']) == 2
    assert len(fv['E']) == 1
    assert set(fv.keys()) == set('ABCDE')
    assert isinstance(dg, list)
    assert len(dg) == 4
    assert [len(l) for l in dg] == [1, 3, 2, 1]

    def to_set(xx):
        return set(x.id for x in xx)

    assert [set(s) for s in ('A', 'BCE', 'CD', 'D')] == [to_set(xx) for xx in dg]
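# Illustrative sketch (hedged): SimpleDocNav wraps a plain dict document and
# exposes lineage sources as further SimpleDocNav objects. A two-node
# document is enough to see the navigation pattern exercised above (ids here
# are placeholder strings, matching this test's style):
#
#   doc = dict(id='X',
#              lineage=dict(source_datasets=dict(
#                  xy=dict(id='Y', lineage=dict(source_datasets={})))))
#   nav = SimpleDocNav(doc)
#   assert nav.id == 'X'
#   assert nav.sources['xy'].doc == doc['lineage']['source_datasets']['xy']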
def add(self, dataset, with_lineage=None, **kwargs):
    """
    Add ``dataset`` to the index. No-op if it is already present.

    :param Dataset dataset: dataset to add

    :param bool with_lineage:
       - ``True`` -- attempt adding lineage datasets if they are missing
       - ``False`` -- record lineage relations, but do not attempt
         adding lineage datasets to the db

    :rtype: Dataset
    """

    def process_bunch(dss, main_ds, transaction):
        edges = []

        # First insert all new datasets
        for ds in dss:
            is_new = transaction.insert_dataset(ds.metadata_doc_without_lineage(), ds.id, ds.type.id)
            if is_new:
                edges.extend((name, ds.id, src.id)
                             for name, src in ds.sources.items())

        # Second insert lineage graph edges
        for ee in edges:
            transaction.insert_dataset_source(*ee)

        # Finally update location for top-level dataset only
        if main_ds.uris is not None:
            self._ensure_new_locations(main_ds, transaction=transaction)

    if with_lineage is None:
        # Fall back to the deprecated sources_policy kwarg when supplied
        policy = kwargs.pop('sources_policy', None)
        if policy is not None:
            _LOG.debug('Use of sources_policy is deprecated')
            with_lineage = (policy != "skip")
            if policy == 'verify':
                _LOG.debug('Verify is no longer done inside add')
        else:
            with_lineage = True

    _LOG.info('Indexing %s', dataset.id)

    if with_lineage:
        ds_by_uuid = flatten_datasets(dataset)
        all_uuids = list(ds_by_uuid)

        present = {k: v for k, v in zip(all_uuids, self.bulk_has(all_uuids))}

        if present[dataset.id]:
            _LOG.warning('Dataset %s is already in the database', dataset.id)
            return dataset

        # Insert the main dataset plus any lineage datasets not yet indexed
        dss = [ds
               for ds in (v[0] for v in ds_by_uuid.values())
               if not present[ds.id]]
    else:
        if self.has(dataset.id):
            _LOG.warning('Dataset %s is already in the database', dataset.id)
            return dataset

        dss = [dataset]

    with self._db.begin() as transaction:
        process_bunch(dss, dataset, transaction)

    return dataset
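# Usage sketch (hedged): assuming ``index`` is a connected Datacube index and
# ``dataset`` a Dataset resolved together with its lineage (e.g. via the
# ``resolve`` closure above):
#
#   index.datasets.add(dataset)                      # also adds missing lineage
#   index.datasets.add(dataset, with_lineage=False)  # do not add lineage datasets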