Ejemplo n.º 1
0
    def __call__(self, doc, uri):
        """Try to build a dataset from a metadata document and a uri.

        :param doc: Dictionary or SimpleDocNav object
        :param uri: String "location" property of the Dataset

        :return: (dataset, None) if successful,
        :return: (None, ErrorMessage) on failure
        """
        nav = doc if isinstance(doc, SimpleDocNav) else SimpleDocNav(doc)

        if self._eo3:
            # ``_eo3 == 'auto'`` is forwarded to prep_eo3 as ``auto_skip``.
            nav = SimpleDocNav(
                prep_eo3(nav.doc, auto_skip=(self._eo3 == 'auto')))

        dataset, err = self._ds_resolve(nav, uri)
        if dataset is None:
            return None, err

        # A resolved dataset is only returned when it passes the
        # consistency check; otherwise the reason is reported instead.
        consistent, reason = check_dataset_consistent(dataset)
        return (dataset, None) if consistent else (None, reason)
def check_missing_lineage(clirunner, index):
    """Adding A with ``--no-auto-add-lineage`` is refused until lineage exists.

      A -> B
      |    |
      |    v
      +--> C -> D
      |
      +--> E

    Use --no-auto-add-lineage
    """
    root = SimpleDocNav(gen_dataset_test_dag(44, force_tree=True))
    lineage_docs = [root.sources[name].doc for name in ('ae', 'ab', 'ac')]

    prefix = write_files({
        'lineage.yml': yaml.safe_dump_all(lineage_docs),
        'main.yml': yaml.safe_dump(root.doc),
    })

    def add_main_without_lineage():
        # A fresh argument list is built for every invocation.
        return clirunner(['dataset', 'add', '--no-auto-add-lineage',
                          str(prefix / 'main.yml')])

    result = add_main_without_lineage()
    assert 'ERROR Following lineage datasets are missing' in result.output
    assert index.datasets.has(root.id) is False

    # now add lineage and try again
    clirunner(['dataset', 'add', str(prefix / 'lineage.yml')])
    assert index.datasets.has(root.sources['ae'].id)
    add_main_without_lineage()

    assert index.datasets.has(root.id)
def check_with_existing_lineage(clirunner, index):
    """Index the lineage first, then add A restricted to product A.

      A -> B
      |    |
      |    v
      +--> C -> D
      |
      +--> E

    Add nodes BCE(D) with auto-matching, then add node A with product restricted to A only.
    """
    root = SimpleDocNav(gen_dataset_test_dag(33, force_tree=True))
    lineage_docs = [root.sources[name].doc for name in ('ab', 'ac', 'ae')]

    prefix = write_files({
        'lineage.yml': yaml.safe_dump_all(lineage_docs),
        'main.yml': yaml.safe_dump(root.doc),
    })

    clirunner(['dataset', 'add', str(prefix / 'lineage.yml')])
    for name in ('ae', 'ab', 'ac'):
        assert index.datasets.get(root.sources[name].id) is not None

    add_main_cmd = ['dataset', 'add', '--no-auto-add-lineage',
                    '--product', 'A', str(prefix / 'main.yml')]
    clirunner(add_main_cmd)

    assert index.datasets.get(root.id) is not None
Ejemplo n.º 4
0
def doc_path_stream(files, on_error, uri=True):
    """Yield ``(path, SimpleDocNav)`` pairs for every document in ``files``.

    Each entry of ``files`` is handed to ``read_documents`` together with
    ``uri``. When an ``InvalidDocException`` escapes while a file is being
    processed, ``on_error(fname, exception)`` is called and iteration moves
    on to the next file.
    """
    for source in files:
        try:
            documents = read_documents(source, uri=uri)
            for location, raw in documents:
                yield location, SimpleDocNav(raw)
        except InvalidDocException as error:
            on_error(source, error)
Ejemplo n.º 5
0
def load_dataset_definition(path):
    """Load the first metadata document found for ``path``.

    ``path`` may be a string or a ``pathlib.Path``; the metadata file is
    located with ``get_metadata_path``. Returns a ``SimpleDocNav`` wrapping
    the first document read, or ``None`` when no document is produced.
    """
    location = path if isinstance(path, pathlib.Path) else pathlib.Path(path)

    for entry in read_documents(get_metadata_path(location)):
        # Only the first document is of interest.
        _, first_doc = entry
        return SimpleDocNav(first_doc)

    return None
def check_no_product_match(clirunner, index):
    """Check that a dataset matching none of the allowed products is rejected."""
    ds = SimpleDocNav(gen_dataset_test_dag(22, force_tree=True))

    prefix = write_files({'agdc-metadata.yml': yaml.safe_dump(ds.doc)})

    def add_dataset(*options):
        # Runs ``dataset add <options...> <prefix>`` and returns the result.
        return clirunner(['dataset', 'add'] + list(options) + [str(prefix)])

    output = add_dataset('--product', 'A').output
    assert 'ERROR Dataset metadata did not match product signature' in output

    output = add_dataset('--product', 'A', '--product', 'B').output
    assert 'ERROR No matching Product found for dataset' in output

    assert index.datasets.get(ds.id, include_sources=True) is None

    # Ignore lineage but fail to match main dataset
    output = add_dataset('--product', 'B', '--confirm-ignore-lineage').output

    assert 'ERROR Dataset metadata did not match product signature' in output
    assert index.datasets.has(ds.id) is False
Ejemplo n.º 7
0
def test_dataset_add(dataset_add_configs, index_empty, clirunner):
    """End-to-end check of the ``dataset add`` / ``search`` / ``info`` commands.

    The statements are order-dependent: the assertions expect later CLI calls
    to observe what earlier calls added to the index.
    """
    p = dataset_add_configs
    index = index_empty
    # Nothing has been registered yet, so adding a dataset is expected to fail.
    r = clirunner(['dataset', 'add', p.datasets], expect_success=False)
    assert r.exit_code != 0
    assert 'Found no products' in r.output

    # Register metadata type and products, then add the good and bad datasets.
    clirunner(['metadata_type', 'add', p.metadata])
    clirunner(['product', 'add', p.products])
    clirunner(['dataset', 'add', p.datasets])
    clirunner(['dataset', 'add', p.datasets_bad1])

    ds = load_dataset_definition(p.datasets)
    ds_bad1 = load_dataset_definition(p.datasets_bad1)

    # The good dataset and its lineage are listed; the bad one is not.
    r = clirunner(['dataset', 'search'], expect_success=True)
    assert ds.id in r.output
    assert ds_bad1.id not in r.output
    assert ds.sources['ab'].id in r.output
    assert ds.sources['ac'].sources['cd'].id in r.output

    r = clirunner(['dataset', 'info', '-f', 'csv', ds.id])
    assert ds.id in r.output

    r = clirunner(['dataset', 'info', '-f', 'yaml', '--show-sources', ds.id])
    assert ds.sources['ae'].id in r.output

    r = clirunner([
        'dataset', 'info', '-f', 'yaml', '--show-derived', ds.sources['ae'].id
    ])
    assert ds.id in r.output

    # The dataset on disk is expected to equal the generated test DAG with seed 1.
    ds_ = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))
    assert ds_.id == ds.id

    # Lineage stored in the index mirrors the lineage of the source document.
    x = index.datasets.get(ds.id, include_sources=True)
    assert str(x.sources['ab'].id) == ds.sources['ab'].id
    assert str(
        x.sources['ac'].sources['cd'].id) == ds.sources['ac'].sources['cd'].id

    # Scenario helpers; they run against the same clirunner / index.
    check_skip_lineage_test(clirunner, index)
    check_no_product_match(clirunner, index)
    check_with_existing_lineage(clirunner, index)
    check_inconsistent_lineage(clirunner, index)
    check_missing_metadata_doc(clirunner)
    check_missing_lineage(clirunner, index)
    check_no_confirm(clirunner, p.datasets)
    check_bad_yaml(clirunner, index)

    # check --product=nosuchproduct
    r = clirunner(['dataset', 'add', '--product', 'nosuchproduct', p.datasets],
                  expect_success=False)

    assert "ERROR Supplied product name" in r.output
    assert r.exit_code != 0

    # Check that deprecated option is accepted
    r = clirunner(['dataset', 'add', '--auto-match', p.datasets])
    assert 'WARNING --auto-match option is deprecated' in r.output
Ejemplo n.º 8
0
    def resolve(main_ds, uri):
        """Turn a metadata document into a ``Dataset`` with resolved lineage.

        Uses ``index``, ``fail_on_missing_lineage``, ``verify_lineage`` and
        ``match_product`` from the enclosing scope (not visible in this block).

        :param main_ds: metadata document, passed through ``dedup_lineage``
        :param uri: location attached to the main dataset only

        :return: ``(dataset, None)`` on success, ``(None, error)`` on failure,
                 where ``error`` is an exception instance or a message string
        """
        try:
            root = SimpleDocNav(dedup_lineage(main_ds))
        except InvalidDocException as e:
            return None, e

        main_uuid = root.id

        # One navigator per uuid (the first one seen for each).
        ds_by_uuid = toolz.valmap(toolz.first, flatten_datasets(root))
        all_uuid = list(ds_by_uuid)

        # Datasets already present in the index, keyed by uuid string.
        db_dss = {}
        for found in index.datasets.bulk_get(all_uuid):
            db_dss[str(found.id)] = found

        lineage_uuids = {u for u in all_uuid if u != main_uuid}
        missing_lineage = lineage_uuids - set(db_dss)

        if missing_lineage and fail_on_missing_lineage:
            message = ("Following lineage datasets are missing from DB: %s"
                       % ','.join(missing_lineage))
            return None, message

        if verify_lineage and not is_doc_eo3(root.doc):
            bad_lineage = []

            for uuid in lineage_uuids:
                if uuid not in db_dss:
                    continue
                supplied = jsonify_document(
                    ds_by_uuid[uuid].doc_without_lineage_sources)
                ok, err = check_consistent(supplied,
                                           db_dss[uuid].metadata_doc)
                if not ok:
                    bad_lineage.append((uuid, err))

            if bad_lineage:
                report_lines = [
                    'Inconsistent lineage dataset {}:\n> {}'.format(uuid, err)
                    for uuid, err in bad_lineage
                ]
                return None, '\n'.join(report_lines)

        def resolve_ds(ds, sources, cache=None):
            hit = cache.get(ds.id)
            if hit is not None:
                return hit

            uris = [uri] if ds.id == main_uuid else []
            doc = ds.doc

            # Prefer the product of the already-indexed dataset.
            known = db_dss.get(ds.id)
            product = known.type if known else match_product(doc)

            resolved = Dataset(product, doc, uris=uris, sources=sources)
            cache[ds.id] = resolved
            return resolved

        try:
            return remap_lineage_doc(root, resolve_ds, cache={}), None
        except BadMatch as e:
            return None, e
def test_dataset_add_ambgious_products(dataset_add_configs, index_empty, clirunner):
    """Datasets matching two products are rejected unless the choice is forced."""
    cfg = dataset_add_configs
    index = index_empty

    datasets = [
        SimpleDocNav(dataset_maker(seed)(
            'A',
            product_type='eo',
            flag_a='a',
            flag_b='b'))
        for seed in [1, 2]
    ]

    products_yaml = '''
name: A
description: test product A
metadata_type: minimal
metadata:
    product_type: eo
    flag_a: a

---
name: B
description: test product B
metadata_type: minimal
metadata:
    product_type: eo
    flag_b: b
    '''

    prefix = write_files({
        'products.yml': products_yaml,
        'dataset1.yml': yaml.safe_dump(datasets[0].doc),
        'dataset2.yml': yaml.safe_dump(datasets[1].doc),
    })

    clirunner(['metadata', 'add', cfg.metadata])
    clirunner(['product', 'add', str(prefix / 'products.yml')])

    assert len(list(index.products.get_all())) == 2

    for number, ds in enumerate(datasets, 1):
        result = clirunner(
            ['dataset', 'add', str(prefix / ('dataset%d.yml' % number))])
        assert 'ERROR Auto match failed' in result.output
        assert 'matches several products' in result.output
        assert index.datasets.has(ds.id) is False

    # check that forcing product works
    first = datasets[0]
    clirunner(['dataset', 'add',
               '--product', 'A',
               str(prefix / 'dataset1.yml')])

    assert index.datasets.has(first.id) is True

    # check that forcing via exclude works
    second = datasets[1]
    clirunner(['dataset', 'add',
               '--exclude-product', 'B',
               str(prefix / 'dataset2.yml')])

    assert index.datasets.has(second.id) is True
Ejemplo n.º 10
0
def dedup_lineage(root):
    """Replace repeated lineage nodes with references to one shared node.

    The tree under ``root`` is rebuilt with ``remap_lineage_doc``. The first
    time a dataset id is seen its rebuilt node is cached; every later
    occurrence of that id yields the cached node instead of a new copy.

    :param dict|SimpleDocNav root: document to process

    :raises InvalidDocException: when a repeated dataset (same id, reached by
        a different path from the root) has metadata or lineage sources that
        differ from its first occurrence

    Returns a new document that has the same structure as input document, but
    with duplicate entries now being aliases rather than copies.
    """
    def check_sources(a, b):
        """ True when both mappings hold the very same objects (identity,
        not mere equality) under the same names.
        """
        if len(a) != len(b):
            return False

        paired = zip(sorted_items(a), sorted_items(b))
        return not any(ak != bk or av is not bv
                       for (ak, av), (bk, bv) in paired)

    def mk_node(ds, sources, cache, sources_path):
        seen = cache.get(ds.id)
        doc = ds.doc_without_lineage_sources

        if seen is None:
            # First occurrence: build the node and remember it.
            node = toolz.assoc_in(doc, sources_path, sources)
            cache[ds.id] = (node, doc, sources)
            return node

        prev_node, prev_doc, prev_sources = seen

        if not check_sources(sources, prev_sources):
            raise InvalidDocException(
                'Inconsistent lineage for repeated dataset with _id: {}'.
                format(ds.id))

        if doc != prev_doc:
            raise InvalidDocException(
                'Inconsistent metadata for repeated dataset with _id: {}'.
                format(ds.id))

        return prev_node

    nav = root if isinstance(root, SimpleDocNav) else SimpleDocNav(root)

    return remap_lineage_doc(nav,
                             mk_node,
                             cache={},
                             sources_path=nav.sources_path)
Ejemplo n.º 11
0
def remap_lineage_doc(root, mk_node, **kwargs):
    """Rebuild a lineage tree bottom-up using ``mk_node``.

    The sources of a node are remapped first (in ``sorted_items`` order),
    then ``mk_node(node, remapped_sources, **kwargs)`` is called for the node
    itself. The value produced for ``root`` is returned.

    :param root: ``SimpleDocNav`` or a raw document (wrapped on entry)
    :param mk_node: callable ``(ds, sources, **kwargs)`` producing a new node
    :param kwargs: forwarded unchanged to every ``mk_node`` call
    """
    def visit(node):
        children = {}
        for name, child in sorted_items(node.sources):
            children[name] = visit(child)
        return mk_node(node, children, **kwargs)

    start = root if isinstance(root, SimpleDocNav) else SimpleDocNav(root)
    return visit(start)
Ejemplo n.º 12
0
def test_dataset_maker():
    """Exercise ``dataset_maker``: repeatability, ids, timestamps, sources."""
    make = dataset_maker(0)
    assert make('aa') == make('aa')

    nav_a = SimpleDocNav(make('A'))
    nav_b = SimpleDocNav(make('B'))

    # Same maker, different names: distinct ids but a shared creation time.
    assert nav_a.id != nav_b.id
    assert nav_a.doc['creation_dt'] == nav_b.doc['creation_dt']
    assert isinstance(nav_a.id, str)
    assert nav_a.sources == {}

    # Different maker argument: both id and creation time differ.
    first, second = (dataset_maker(seed)('A', product_type='eo')
                     for seed in (0, 1))
    assert first['id'] != second['id']
    assert first['creation_dt'] != second['creation_dt']
    assert first['product_type'] == 'eo'

    # Source documents are embedded by reference, not copied.
    nav_c = SimpleDocNav(make('C', sources={'a': nav_a.doc, 'b': nav_b.doc}))
    assert nav_c.sources['a'].doc is nav_a.doc
    assert nav_c.sources['b'].doc is nav_b.doc
Ejemplo n.º 13
0
def test_remap_lineage_doc():
    """``remap_lineage_doc`` accepts a ``SimpleDocNav`` as well as a raw doc."""
    def mk_node(ds, sources):
        return dict(id=ds.id, **sources)

    ds = SimpleDocNav(gen_dataset_test_dag(3, force_tree=True))

    # First the navigator itself, then the underlying document.
    for root in (ds, ds.doc):
        remapped = remap_lineage_doc(root, mk_node)
        assert remapped['id'] == ds.id
        assert remapped['ac']['id'] == ds.sources['ac'].id
Ejemplo n.º 14
0
    def mk_dataset(ds, uri):
        """Build a ``Dataset`` for a document whose id is already indexed.

        Uses ``index`` from the enclosing scope (not visible in this block).

        :param ds: document navigator exposing ``id`` and ``doc``
        :param uri: location attached to the constructed dataset

        :return: ``(dataset, existing, None)`` on success, or
                 ``(None, None, error_message)`` when the document has no id
                 or no dataset with that id is found in the index
        """
        uuid = ds.id

        if uuid is None:
            return None, None, "Metadata document is missing id field"

        existing = index.datasets.get(uuid)
        if existing is None:
            return None, None, "No such dataset in the database: {}".format(uuid)

        ds = SimpleDocNav(prep_eo3(ds.doc, auto_skip=True))

        # The product is taken from the dataset already in the index.
        # TODO: what about sources=?
        return Dataset(existing.type,
                       ds.doc_without_lineage_sources,
                       uris=[uri]), existing, None
Ejemplo n.º 15
0
def check_skip_lineage_test(clirunner, index):
    """With ``--confirm-ignore-lineage`` only the top-level dataset is indexed."""
    ds = SimpleDocNav(gen_dataset_test_dag(11, force_tree=True))

    prefix = write_files({'agdc-metadata.yml': yaml.safe_dump(ds.doc)})

    add_cmd = ['dataset', 'add', '--confirm-ignore-lineage',
               '--product', 'A', str(prefix)]
    clirunner(add_cmd)

    stored = index.datasets.get(ds.id, include_sources=True)
    assert stored is not None
    assert str(stored.id) == ds.id
    assert stored.sources == {}

    # None of the lineage datasets may have been added.
    ignored = (
        ds.sources['ab'],
        ds.sources['ac'],
        ds.sources['ae'],
        ds.sources['ac'].sources['cd'],
    )
    for source in ignored:
        assert index.datasets.get(source.id) is None
Ejemplo n.º 16
0
def test_dataset_add_ambgious_products(dataset_add_configs, index_empty,
                                       clirunner):
    """A dataset matching two products is rejected until a product is forced."""
    cfg = dataset_add_configs
    index = index_empty
    make = dataset_maker(0)

    ds = SimpleDocNav(make('A', product_type='eo', flag_a='a', flag_b='b'))

    products_yaml = '''
name: A
description: test product A
metadata_type: minimal
metadata:
    product_type: eo
    flag_a: a

---
name: B
description: test product B
metadata_type: minimal
metadata:
    product_type: eo
    flag_b: b
    '''

    prefix = write_files({
        'products.yml': products_yaml,
        'dataset.yml': yaml.safe_dump(ds.doc),
    })
    dataset_path = str(prefix / 'dataset.yml')

    clirunner(['metadata_type', 'add', cfg.metadata])
    clirunner(['product', 'add', str(prefix / 'products.yml')])

    assert len(list(index.products.get_all())) == 2

    result = clirunner(['dataset', 'add', dataset_path])
    assert 'ERROR Auto match failed' in result.output
    assert 'matches several products' in result.output
    assert index.datasets.has(ds.id) is False

    # check that forcing product works
    clirunner(['dataset', 'add', '--product', 'A', dataset_path])

    assert index.datasets.has(ds.id) is True
Ejemplo n.º 17
0
def check_inconsistent_lineage(clirunner, index):
    """Lineage that disagrees with the index blocks the add unless unverified.

      A -> B
      |    |
      |    v
      +--> C -> D
      |
      +--> E

    Add node E,
    then try adding A with modified E in the lineage, should fail to add ABCD
    """
    ds = SimpleDocNav(gen_dataset_test_dag(1313, force_tree=True))

    lineage_docs = [ds.sources[name].doc for name in ('ae', )]
    label_path = 'lineage.source_datasets.ae.label'.split('.')
    tampered_doc = toolz.assoc_in(ds.doc, label_path, 'modified')

    prefix = write_files({
        'lineage.yml': yaml.safe_dump_all(lineage_docs),
        'main.yml': yaml.safe_dump(tampered_doc),
    })
    main_path = str(prefix / 'main.yml')

    clirunner(['dataset', 'add', str(prefix / 'lineage.yml')])
    assert index.datasets.get(ds.sources['ae'].id) is not None

    result = clirunner(['dataset', 'add', main_path])

    assert 'ERROR Inconsistent lineage dataset' in result.output

    # Nodes A, B, C and D, in the order they are checked.
    abcd = (
        ds,
        ds.sources['ab'],
        ds.sources['ac'],
        ds.sources['ac'].sources['cd'],
    )
    for node in abcd:
        assert index.datasets.has(node.id) is False

    # now again but skipping verification check
    clirunner(['dataset', 'add', '--no-verify-lineage', main_path])

    for node in abcd:
        assert index.datasets.has(node.id)
Ejemplo n.º 18
0
def test_dataset_add_with_nans(dataset_add_configs, index_empty, clirunner):
    """NaN and +/-Infinity metadata values are stored as strings."""
    cfg = dataset_add_configs
    index = index_empty

    clirunner(['metadata', 'add', cfg.metadata])
    clirunner(['product', 'add', cfg.products])

    make = dataset_maker(0)

    doc_c = make('C',
                 product_type='C',
                 val_is_nan=math.nan,
                 val_is_inf=math.inf,
                 val_is_neginf=-math.inf)

    # C is the shared source of both B and A.
    doc_b = make('B', sources={'bc': doc_c}, product_type='B')
    doc_a = make('A', sources={'ac': doc_c}, product_type='A')

    prefix = write_files({
        'dataset.yml': yaml.safe_dump_all([doc_a, doc_b]),
    })

    add_cmd = ['dataset', 'add', '--auto-add-lineage', '--verify-lineage',
               str(prefix / 'dataset.yml')]
    result = clirunner(add_cmd)

    assert "ERROR" not in result.output

    nav_a, nav_b, nav_c = (SimpleDocNav(raw)
                           for raw in (doc_a, doc_b, doc_c))

    present = index.datasets.bulk_has([nav_a.id, nav_b.id, nav_c.id])
    assert present == [True, True, True]

    stored = index.datasets.get(nav_c.id).metadata_doc

    assert stored['val_is_nan'] == 'NaN'
    assert stored['val_is_inf'] == 'Infinity'
    assert stored['val_is_neginf'] == '-Infinity'
Ejemplo n.º 19
0
 def doc(loc=None):
     """Wrap a minimal document (fixed id, given ``location``) in SimpleDocNav."""
     raw = {'location': loc, 'id': '4d9fd75c-1309-4712-93b5-f0d9c6fdd8ab'}
     return SimpleDocNav(raw)
Ejemplo n.º 20
0
def test_dataset_add_inconsistent_measurements(dataset_add_configs,
                                               index_empty, clirunner):
    """Only datasets with at least the product's measurements are indexed."""
    cfg = dataset_add_configs
    index = index_empty
    make = dataset_maker(0)

    # not set, empty, subset, full set, super-set
    variants = [
        ('A', {}),
        ('B', {'measurements': {}}),
        ('C', {'measurements': {'red': {}}}),
        ('D', {'measurements': {'red': {}, 'green': {}}}),
        ('E', {'measurements': {'red': {}, 'green': {}, 'extra': {}}}),
    ]
    dss = tuple(SimpleDocNav(make(label, product_type='eo', **extra))
                for label, extra in variants)
    docs = [ds.doc for ds in dss]

    products_yaml = '''
name: eo
description: test product
metadata_type: with_measurements
metadata:
    product_type: eo

measurements:
    - name: red
      dtype: int16
      nodata: -999
      units: '1'

    - name: green
      dtype: int16
      nodata: -999
      units: '1'
    '''

    prefix = write_files({
        'products.yml': products_yaml,
        'dataset.yml': yaml.safe_dump_all(docs),
    })

    clirunner(['metadata', 'add', cfg.metadata])
    clirunner(['product', 'add', str(prefix / 'products.yml')])

    assert len(list(index.products.get_all())) == 1

    result = clirunner(['dataset', 'add', str(prefix / 'dataset.yml')])
    print(result.output)

    listing = clirunner(['dataset', 'search', '-f', 'csv']).output

    # The first three (not set, empty, subset) must be absent ...
    for rejected in dss[:3]:
        assert rejected.id not in listing
    # ... while the full set and the super-set must be present.
    for accepted in dss[3:]:
        assert accepted.id in listing
Ejemplo n.º 21
0
def test_dataset_add(dataset_add_configs, index_empty, clirunner):
    """End-to-end check of ``dataset add`` including ``Doc2Dataset`` and eo3.

    The statements are order-dependent: the assertions expect later CLI calls
    to observe what earlier calls added to the index.
    """
    p = dataset_add_configs
    index = index_empty
    # Nothing has been registered yet, so adding a dataset is expected to fail.
    r = clirunner(['dataset', 'add', p.datasets], expect_success=False)
    assert r.exit_code != 0
    assert 'Found no products' in r.output

    # Register metadata and products, then add good, bad and eo3 datasets.
    clirunner(['metadata', 'add', p.metadata])
    clirunner(['product', 'add', p.products])
    clirunner(['dataset', 'add', p.datasets])
    clirunner(['dataset', 'add', p.datasets_bad1])
    clirunner(['dataset', 'add', p.datasets_eo3])

    ds = load_dataset_definition(p.datasets)
    ds_bad1 = load_dataset_definition(p.datasets_bad1)

    # Check .hl.Doc2Dataset
    doc2ds = Doc2Dataset(index)
    _ds, _err = doc2ds(ds.doc, 'file:///something')
    assert _err is None
    assert str(_ds.id) == ds.id
    assert _ds.metadata_doc == ds.doc

    # Check dataset search

    r = clirunner(['dataset', 'search'], expect_success=True)
    assert ds.id in r.output
    assert ds_bad1.id not in r.output
    assert ds.sources['ab'].id in r.output
    assert ds.sources['ac'].sources['cd'].id in r.output

    r = clirunner(['dataset', 'info', '-f', 'csv', ds.id])
    assert ds.id in r.output

    r = clirunner(['dataset', 'info', '-f', 'yaml', '--show-sources', ds.id])
    assert ds.sources['ae'].id in r.output

    r = clirunner([
        'dataset', 'info', '-f', 'yaml', '--show-derived', ds.sources['ae'].id
    ])
    assert ds.id in r.output

    # The dataset on disk is expected to equal the generated test DAG with seed 1.
    ds_ = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))
    assert ds_.id == ds.id

    # Lineage stored in the index mirrors the lineage of the source document.
    x = index.datasets.get(ds.id, include_sources=True)
    assert str(x.sources['ab'].id) == ds.sources['ab'].id
    assert str(
        x.sources['ac'].sources['cd'].id) == ds.sources['ac'].sources['cd'].id

    # Scenario helpers; they run against the same clirunner / index.
    check_skip_lineage_test(clirunner, index)
    check_no_product_match(clirunner, index)
    check_with_existing_lineage(clirunner, index)
    check_inconsistent_lineage(clirunner, index)
    check_missing_metadata_doc(clirunner)
    check_missing_lineage(clirunner, index)
    check_no_confirm(clirunner, p.datasets)
    check_bad_yaml(clirunner, index)

    # check --product=nosuchproduct
    r = clirunner(['dataset', 'add', '--product', 'nosuchproduct', p.datasets],
                  expect_success=False)

    assert "ERROR Supplied product name" in r.output
    assert r.exit_code != 0

    # Check that deprecated option is accepted
    r = clirunner(['dataset', 'add', '--auto-match', p.datasets])
    assert 'WARNING --auto-match option is deprecated' in r.output

    # test dataset add eo3
    r = clirunner(['dataset', 'add', p.datasets_eo3])
    assert r.exit_code == 0

    ds_eo3 = load_dataset_definition(p.datasets_eo3)
    assert ds_eo3.location is not None

    # The document's ``location`` ends up in ``uris`` and is absent from the
    # stored metadata document.
    _ds = index.datasets.get(ds_eo3.id, include_sources=True)
    assert sorted(_ds.sources) == ['a', 'bc1', 'bc2']
    assert _ds.crs == 'EPSG:3857'
    assert _ds.extent is not None
    assert _ds.extent.crs == _ds.crs
    assert _ds.uris == [ds_eo3.location]
    assert 'location' not in _ds.metadata_doc
Ejemplo n.º 22
0
def test_simple_doc_nav():
    """Check ``SimpleDocNav`` navigation, traversal order and flattening.

      A -> B
      |    |
      |    v
      +--> C -> D
      |
      +--> E
    """
    def node(name, **kwargs):
        return {'id': name, 'lineage': {'source_datasets': kwargs}}

    A, _, C, _, _ = make_graph_abcde(node)
    rdr = SimpleDocNav(A)

    assert rdr.doc == A
    assert rdr.doc_without_lineage_sources == node('A')
    assert isinstance(rdr.sources['ae'], SimpleDocNav)
    assert rdr.sources['ab'].sources['bc'].doc == C
    # Repeated attribute access must hand back the very same object.
    assert rdr.doc_without_lineage_sources is rdr.doc_without_lineage_sources
    assert rdr.sources is rdr.sources
    assert isinstance(rdr.sources_path, tuple)

    def visitor(node, name=None, depth=0, out=None):
        out.append('{}:{}:{:d}'.format(node.id, name or '..', depth))

    expect_preorder = '''
A:..:0
B:ab:1
C:bc:2
D:cd:3
C:ac:1
D:cd:2
E:ae:1
'''.strip()

    expect_postorder = '''
D:cd:3
C:bc:2
B:ab:1
D:cd:2
C:ac:1
E:ae:1
A:..:0
'''.strip()

    cases = (('pre-order', expect_preorder),
             ('post-order', expect_postorder))
    for mode, expect in cases:
        visited = []
        traverse_datasets(rdr, visitor, mode=mode, out=visited)
        assert '\n'.join(visited) == expect

    def check_flat(flat):
        # C is reachable along two paths, A and E along one.
        assert len(flat['A']) == 1
        assert len(flat['C']) == 2
        assert len(flat['E']) == 1
        assert set(flat.keys()) == set('ABCDE')

    check_flat(flatten_datasets(rdr))

    fv, dg = flatten_datasets(rdr, with_depth_grouping=True)

    check_flat(fv)
    assert isinstance(dg, list)
    assert len(dg) == 4
    assert [len(group) for group in dg] == [1, 3, 2, 1]

    def to_set(xx):
        return {x.id for x in xx}

    expected_groups = [set(s) for s in ('A', 'BCE', 'CD', 'D')]
    assert expected_groups == [to_set(group) for group in dg]
Ejemplo n.º 23
0
def test_dedup():
    """Exercise ``dedup_lineage``: aliasing of duplicates and conflict detection.

    Each failure scenario mutates the generated document in place and then
    re-wraps it in a fresh ``SimpleDocNav`` before calling ``dedup_lineage``.
    """
    ds0 = SimpleDocNav(gen_dataset_test_dag(1, force_tree=True))

    # make sure ds0 has duplicate C nodes with equivalent data
    assert ds0.sources['ab'].sources['bc'].doc is not ds0.sources['ac'].doc
    assert ds0.sources['ab'].sources['bc'].doc == ds0.sources['ac'].doc

    # After dedup both paths to C (and to D below it) yield the same object.
    ds = SimpleDocNav(dedup_lineage(ds0))
    assert ds.sources['ab'].sources['bc'].doc is ds.sources['ac'].doc
    assert ds.sources['ab'].sources['bc'].sources['cd'].doc is ds.sources[
        'ac'].sources['cd'].doc

    # again but with raw doc
    ds = SimpleDocNav(dedup_lineage(ds0.doc))
    assert ds.sources['ab'].sources['bc'].doc is ds.sources['ac'].doc
    assert ds.sources['ab'].sources['bc'].sources['cd'].doc is ds.sources[
        'ac'].sources['cd'].doc

    # Test that we detect inconsistent metadata for duplicate entries
    ds0 = SimpleDocNav(gen_dataset_test_dag(3, force_tree=True))
    ds0.sources['ac'].doc['label'] = 'Modified'
    ds0 = SimpleDocNav(ds0.doc)
    assert ds0.sources['ab'].sources['bc'].doc != ds0.sources['ac'].doc

    with pytest.raises(InvalidDocException, match=r'Inconsistent metadata .*'):
        dedup_lineage(ds0)

    # Test that we detect inconsistent lineage subtrees for duplicate entries

    # Subtest 1: different set of keys
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)

    assert 'cd' in srcs
    srcs['cd'] = {}
    ds0 = SimpleDocNav(ds0.doc)

    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)

    # Subtest 2: different values for "child" nodes
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)

    assert 'cd' in srcs
    srcs['cd']['id'] = '7fe57724-ed44-4beb-a3ab-c275339049be'
    ds0 = SimpleDocNav(ds0.doc)

    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)

    # Subtest 3: different name for child
    ds0 = SimpleDocNav(gen_dataset_test_dag(7, force_tree=True))
    srcs = toolz.get_in(ds0.sources_path, ds0.sources['ac'].doc)

    assert 'cd' in srcs
    srcs['CD'] = srcs['cd']
    del srcs['cd']
    ds0 = SimpleDocNav(ds0.doc)

    with pytest.raises(InvalidDocException, match=r'Inconsistent lineage .*'):
        dedup_lineage(ds0)