# Tests for pyapacheatlas' Reader, which parses spreadsheet-style rows
# (lists of dicts) into Apache Atlas entities and type definitions.
# The imports and shared config below are assumed from pyapacheatlas;
# module paths may vary slightly by version.
import json

import pytest

from pyapacheatlas.core import AtlasProcess
from pyapacheatlas.core.typedef import AtlasAttributeDef
from pyapacheatlas.readers.reader import Reader, ReaderConfiguration

READER_CONFIG = ReaderConfiguration()


def test_column_lineage_entities_with_classifications():
    reader = Reader(READER_CONFIG)

    json_tables, json_columns, atlas_typedefs = setup_column_lineage_entities()

    # Update target to include a classification
    json_columns[0].update({
        "Target classifications": "CustomerInfo; PII",
        "Source classifications": ""
    })

    # parse_table_lineage assigns placeholder guids; -1003 is the last one
    tables_and_processes = reader.parse_table_lineage(json_tables)

    results = reader.parse_column_lineage(json_columns, tables_and_processes,
                                          atlas_typedefs)

    # Two column entities
    # One process entity
    target_col_entity = results[0]
    source_col_entity = results[1]
    col_lineage_entity = results[2]

    assert (len(target_col_entity.classifications) == 2)
    assert ({
        "typeName": "CustomerInfo",
        "attributes": {}
    } in target_col_entity.classifications)
    assert ({
        "typeName": "PII",
        "attributes": {}
    } in target_col_entity.classifications)
    assert (len(source_col_entity.classifications) == 0)
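
# The column-lineage tests above and below share a
# setup_column_lineage_entities() helper that is not shown in this listing.
# A minimal sketch, inferred from the assertions in these tests (table0/col0
# feeding table1/col1 through proc01, with demo_* type names); the real
# helper's typedefs may differ in detail.
def setup_column_lineage_entities():
    # One table-level lineage row: table0 -> proc01 -> table1
    json_tables = [{
        "Target table": "table1",
        "Target type": "demo_table",
        "Source table": "table0",
        "Source type": "demo_table",
        "Process name": "proc01",
        "Process type": "demo_process"
    }]
    # One column-level lineage row: table0#col0 -> table1#col1
    json_columns = [{
        "Target column": "col1",
        "Target table": "table1",
        "Source column": "col0",
        "Source table": "table0",
        "transformation": None
    }]
    # Typedefs that let the parser resolve demo_table's column type and
    # demo_process's column-lineage type; this structure is an assumption.
    atlas_typedefs = {
        "entityDefs": [
            {"typeName": "demo_table",
             "relationshipAttributeDefs": [{
                 "relationshipTypeName": "demo_table_columns",
                 "name": "columns",
                 "typeName": "array<demo_column>"
             }]},
            {"typeName": "demo_process",
             "relationshipAttributeDefs": [{
                 "relationshipTypeName": "demo_process_column_lineage",
                 "name": "columnLineages",
                 "typeName": "array<demo_column_lineage>"
             }]}
        ],
        "relationshipDefs": [
            {"name": "demo_table_columns",
             "endDef1": {"type": "demo_table", "name": "columns"},
             "endDef2": {"type": "demo_column", "name": "table"}},
            {"name": "demo_process_column_lineage",
             "endDef1": {"type": "demo_column_lineage", "name": "query"},
             "endDef2": {"type": "demo_process", "name": "columnLineages"}}
        ]
    }
    return json_tables, json_columns, atlas_typedefs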


def test_parse_update_lineage_multi_out():
    reader = Reader(READER_CONFIG)

    json_rows = [{
        "Target typeName": "demo_table",
        "Target qualifiedName": "demotarget",
        "Source typeName": "demo_table2",
        "Source qualifiedName": "demosource",
        "Process name": "proc01",
        "Process qualifiedName": "procqual01",
        "Process typeName": "Process2"
    }, {
        "Target typeName": "demo_table",
        "Target qualifiedName": "demotarget2",
        "Source typeName": None,
        "Source qualifiedName": None,
        "Process name": "proc01",
        "Process qualifiedName": "procqual01",
        "Process typeName": "Process2"
    }]

    results = reader.parse_update_lineage(json_rows)

    assert (len(results) == 1)
    inputs = results[0]["attributes"]["inputs"]
    outputs = results[0]["attributes"]["outputs"]
    assert (len(outputs) == 2)
    output_names = set(
        [x["uniqueAttributes"]["qualifiedName"] for x in outputs])
    assert (output_names == set(["demotarget", "demotarget2"]))
    assert (len(inputs) == 1)


def test_parse_update_lineage_multi_row_with_na_last():
    reader = Reader(READER_CONFIG)

    json_rows = [
        {
            "Target typeName": "demo_table",
            "Target qualifiedName": "demotarget",
            "Source typeName": "demo_table2",
            "Source qualifiedName": "demosource",
            "Process name": "proc01",
            "Process qualifiedName": "procqual01",
            "Process typeName": "Process2"
        },
        {
            "Target typeName": "N/A",
            "Target qualifiedName": "N/A",
            "Source typeName": "demo_table2",
            "Source qualifiedName": "demosource2",
            "Process name": "proc01",
            "Process qualifiedName": "procqual01",
            "Process typeName": "Process2"
        },
    ]
    with pytest.warns(UserWarning):
        results = reader.parse_update_lineage(json_rows)
    assert (len(results) == 1)
    inputs = results[0]["attributes"]["inputs"]
    outputs = results[0]["attributes"]["outputs"]
    assert (len(outputs) == 0)
    assert (len(inputs) == 2)


def test_column_lineage_entities_with_attributes():
    reader = Reader(READER_CONFIG)

    json_tables, json_columns, atlas_typedefs = setup_column_lineage_entities()

    # Update target to include an attribute
    json_columns[0].update({
        "Target test_attrib1": "value",
        "Target test_attrib2": "value2",
        "Source foo": "bar"
    })

    # parse_table_lineage assigns placeholder guids; -1003 is the last one
    tables_and_processes = reader.parse_table_lineage(json_tables)

    results = reader.parse_column_lineage(json_columns, tables_and_processes,
                                          atlas_typedefs)

    # Two column entities
    # One process entity
    target_col_entity = results[0]
    source_col_entity = results[1]
    col_lineage_entity = results[2]

    assert (target_col_entity.attributes["test_attrib1"] == "value")
    assert (target_col_entity.attributes["test_attrib2"] == "value2")
    assert (source_col_entity.attributes["foo"] == "bar")


def test_table_lineage():
    reader = Reader(READER_CONFIG)
    json_rows = [{
        "Target table": "table1",
        "Target type": "demo_type",
        "Source table": "table0",
        "Source type": "demo_type2",
        "Process name": "proc01",
        "Process type": "proc_type"
    }]

    results = reader.parse_table_lineage(json_rows)

    assert (results[0].to_json(minimum=True) == {
        "typeName": "demo_type",
        "guid": -1001,
        "qualifiedName": "table1"
    })
    assert (results[1].to_json(minimum=True) == {
        "typeName": "demo_type2",
        "guid": -1002,
        "qualifiedName": "table0"
    })
    assert (results[2].to_json(minimum=True) == {
        "typeName": "proc_type",
        "guid": -1003,
        "qualifiedName": "proc01"
    })
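
# In practice, the parsed entities are then uploaded in bulk. A minimal,
# hypothetical sketch (assumes an authenticated pyapacheatlas client; this
# helper is illustrative and not used by the tests):
def _example_upload_table_lineage(reader, json_rows, client):
    # client is assumed to be a pyapacheatlas AtlasClient or PurviewClient
    results = reader.parse_table_lineage(json_rows)
    return client.upload_entities(
        batch=[entity.to_json() for entity in results])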


def test_parse_entity_defs():
    rc = ReaderConfiguration()
    reader = Reader(rc)
    # "Entity TypeName", "name", "description",
    # "isOptional", "isUnique", "defaultValue",
    # "typeName", "displayName", "valuesMinCount",
    # "valuesMaxCount", "cardinality", "includeInNotification",
    # "indexType", "isIndexable"
    json_rows = [{
        "Entity TypeName": "demoType",
        "name": "attrib1",
        "description": "Some desc",
        "isOptional": "True",
        "isUnique": "False",
        "defaultValue": None,
        "typeName": "string",
        "displayName": None,
        "valuesMinCount": None,
        "valuesMaxCount": None,
        "cardinality": None,
        "includeInNotification": None,
        "indexType": None,
        "isIndexable": None
    }]

    results = reader.parse_entity_defs(json_rows)

    assert ("entityDefs" in results)
    assert (len(results["entityDefs"]) == 1)
    assert (results["entityDefs"][0]["attributeDefs"][0]["name"] == "attrib1")


def test_parse_classification_defs():
    rc = ReaderConfiguration()
    reader = Reader(rc)

    json_rows = [{
        "classificationName": "testClassification",
        "entityTypes": None,
        "description": "This is my classification"
    }, {
        "classificationName": "testClassification2",
        "entityTypes": "",
        "description": "This is my classification2"
    }, {
        "classificationName": "testClassification3",
        "entityTypes": "DataSet;Process",
        "description": "This is my classification3"
    }, {
        "classificationName": "testClassification4",
        "entityTypes": "DataSet;",
        "description": "This is my classification4"
    }]

    parsed = reader.parse_classification_defs(json_rows)

    results = parsed["classificationDefs"]

    assert (len(results) == 4)
    assert ("description" in results[0])
    assert (results[0]["name"] == "testClassification")
    assert (len(results[0]["entityTypes"]) == 0)
    assert (len(results[1]["entityTypes"]) == 0)
    assert (len(results[2]["entityTypes"]) == 2)
    assert (len(results[3]["entityTypes"]) == 1)


def test_parse_bulk_entities_with_terms():
    rc = ReaderConfiguration()
    reader = Reader(rc)
    # "typeName", "name",
    # "qualifiedName", "classifications",
    # "[Relationship] table"
    json_rows = [{
        "typeName": "demo_table",
        "name": "entityNameABC",
        "qualifiedName": "qualifiedNameofEntityNameABC",
        "classifications": None,
        "[Relationship] meanings": "My Term;abc"
    }, {
        "typeName": "demo_table",
        "name": "entityNameDEF",
        "qualifiedName": "qualifiedNameofEntityNameDEF",
        "classifications": None,
        "[Relationship] meanings": None
    }]
    results = reader.parse_bulk_entities(json_rows)
    ae1 = results["entities"][0]
    ae2 = results["entities"][1]

    assert ("meanings" in ae1["relationshipAttributes"])
    assert ("meanings" not in ae2["relationshipAttributes"])
    ae1_meanings = ae1["relationshipAttributes"]["meanings"]

    assert (len(ae1_meanings) == 2)
    ae1_meanings_qns = set(
        [e["uniqueAttributes"]["qualifiedName"] for e in ae1_meanings])
    assert (set(["My Term@Glossary", "abc@Glossary"]) == ae1_meanings_qns)


def test_parse_bulk_entities_with_relationships():
    rc = ReaderConfiguration()
    reader = Reader(rc)
    # "typeName", "name",
    # "qualifiedName", "classifications",
    # "[Relationship] table"
    json_rows = [{
        "typeName": "demo_table",
        "name": "entityNameABC",
        "qualifiedName": "qualifiedNameofEntityNameABC",
        "classifications": None,
        "[Relationship] table": None
    }, {
        "typeName": "demo_column",
        "name": "col1",
        "qualifiedName": "col1qn",
        "classifications": None,
        "[Relationship] table": "qualifiedNameofEntityNameABC"
    }, {
        "typeName": "demo_column",
        "name": "col2",
        "qualifiedName": "col2qn",
        "classifications": None,
        "[Relationship] table": None
    }]
    results = reader.parse_bulk_entities(json_rows)
    abc = results["entities"][0]
    col1 = results["entities"][1]
    col2 = results["entities"][2]

    assert("table" in col1["relationshipAttributes"])
    col1_table = col1["relationshipAttributes"]["table"]
    assert(col1_table["typeName"] == "demo_table" )
    assert(col1_table["qualifiedName"] == "qualifiedNameofEntityNameABC" )

    assert("table" not in col2["relationshipAttributes"])


def test_parse_bulk_entities():
    rc = ReaderConfiguration()
    reader = Reader(rc)
    # "typeName", "name",
    # "qualifiedName", "classifications"
    json_rows = [{
        "typeName": "demoType",
        "name": "entityNameABC",
        "qualifiedName": "qualifiedNameofEntityNameABC",
        "classifications": None
    }, {
        "typeName": "demoType",
        "name": "entityNameGHI",
        "qualifiedName": "qualifiedNameofEntityNameGHI",
        "classifications": "PII;CLASS2"
    }, {
        "typeName": "demoType",
        "name": "entityNameJKL",
        "qualifiedName": "qualifiedNameofEntityNameJKL",
        "classifications": "PII"
    }, {
        "typeName": "demoType",
        "name": "entityNameDynamic",
        "qualifiedName": "qualifiedNameofEntityNameDynamic",
        "classifications": None,
        "dynamicAttrib1": "foo",
        "dynamicAttrib2": "bar"
    }]
    results = reader.parse_bulk_entities(json_rows)

    assert ("entities" in results)
    assert (len(results["entities"]) == len(json_rows))
    abc = results["entities"][0]
    ghi = results["entities"][1]
    jkl = results["entities"][2]
    dynamic = results["entities"][3]

    assert ("classifications" not in abc)
    assert (len(ghi["classifications"]) == 2)
    assert (len(jkl["classifications"]) == 1)

    # The classifications should default to NOT propagate
    assert (all(
        c["propagate"] is False
        for c in ghi["classifications"] + jkl["classifications"]
    ))

    assert (jkl["classifications"][0]["typeName"] == "PII")
    ghi_classification_types = set(
        [x["typeName"] for x in ghi["classifications"]])
    assert (set(["PII", "CLASS2"]) == ghi_classification_types)

    assert ("dynamicAttrib1" in dynamic["attributes"])
    assert (dynamic["attributes"]["dynamicAttrib1"] == "foo")
    assert ("dynamicAttrib2" in dynamic["attributes"])
    assert (dynamic["attributes"]["dynamicAttrib2"] == "bar")


def test_parse_entity_defs_extended():
    rc = ReaderConfiguration()
    reader = Reader(rc)
    json_rows = [{
        "Entity TypeName": "generic",
        "name": "attrib1",
        "description": "desc1",
        "isOptional": "True",
        "isUnique": "False",
        "defaultValue": None
    }, {
        "Entity TypeName": "generic",
        "name": "attrib2",
        "description": "desc2",
        "isOptional": "True",
        "isUnique": "False",
        "defaultValue": None,
        "cardinality": "SINGLE"
    }, {
        "Entity TypeName": "demo",
        "name": "attrib3",
        "description": "desc3",
        "isOptional": "False",
        "isUnique": "False",
        "cardinality": "SET"
    }]

    output = reader.parse_entity_defs(json_rows)
    # It is an AtlasTypesDef composite wrapper
    assert ("entityDefs" in output.keys())
    # There are two entity typenames specified so there should be only two entityDefs
    assert (len(output["entityDefs"]) == 2)

    genericEntityDef = None
    demoEntityDef = None

    for entityDef in output["entityDefs"]:
        if entityDef["name"] == "generic":
            genericEntityDef = entityDef
        elif entityDef["name"] == "demo":
            demoEntityDef = entityDef

    # Generic has two attributes
    assert (len(genericEntityDef["attributeDefs"]) == 2)

    # Demo has one attribute
    assert (len(demoEntityDef["attributeDefs"]) == 1)

    assert (demoEntityDef["attributeDefs"][0] == AtlasAttributeDef(
        name="attrib3",
        **{
            "description": "desc3",
            "isOptional": "False",
            "isUnique": "False",
            "cardinality": "SET"
        }).to_json())


def test_entityDefs_warns_with_extra_params():
    rc = ReaderConfiguration()
    reader = Reader(rc)
    # All attribute keys should be converted to camel case except "Entity TypeName"
    inputData = [{
        "Entity TypeName": "generic",
        "name": "attrib1",
        "description": "desc1",
        "isOptional": "True",
        "isUnique": "False",
        "defaultValue": None
    }, {
        "Entity TypeName": "generic",
        "name": "attrib2",
        "description": "desc2",
        "isOptional": "True",
        "isUnique": "False",
        "defaultValue": None,
        "cardinality": "SINGLE",
        "randomAttrib": "foobar"
    }]

    # Assert that a UserWarning occurs when adding an extra attribute
    with pytest.warns(UserWarning):
        reader.parse_entity_defs(json_rows=inputData)


def test_table_lineage_with_attributes():
    reader = Reader(READER_CONFIG)
    json_rows = [{
        "Target table": "table1",
        "Target type": "demo_type",
        "Target data_type": "str",
        "Source table": "table0",
        "Source type": "demo_type2",
        "Source foo": "bar",
        "Process name": "proc01",
        "Process type": "proc_type",
        "Process fizz": "buzz"
    }]

    results = reader.parse_table_lineage(json_rows)

    assert (results[0].attributes["data_type"] == "str")
    assert (results[1].attributes["foo"] == "bar")
    assert (results[2].attributes["fizz"] == "buzz")


def test_column_lineage_entities_with_columnMapping():
    reader = Reader(READER_CONFIG)
    expected_obj = [{
        "ColumnMapping": [{
            "Source": "col0",
            "Sink": "col1"
        }, {
            "Source": "col90",
            "Sink": "col99"
        }],
        "DatasetMapping": {
            "Source": "table0",
            "Sink": "table1"
        }
    }]
    # "[{\"ColumnMapping\": [{\"Source\": \"col0\", \"Sink\": \"col1\"}], \"DatasetMapping\": {\"Source\": \"table0\", \"Sink\": \"table1\"}}]"
    expected = json.dumps(expected_obj)

    json_tables, json_columns, atlas_typedefs = setup_column_lineage_entities()

    json_columns.append({
        "Target column": "col99",
        "Target table": "table1",
        "Source column": "col90",
        "Source table": "table0",
        "transformation": "col90 + 1"
    })

    # parse_table_lineage assigns placeholder guids; -1003 is the last one
    tables_and_processes = reader.parse_table_lineage(json_tables)

    results = reader.parse_column_lineage(json_columns,
                                          tables_and_processes,
                                          atlas_typedefs,
                                          use_column_mapping=True)

    # The table-level process should now carry the serialized columnMapping
    assert ("columnMapping" in tables_and_processes[2].attributes)
    assert (tables_and_processes[2].attributes["columnMapping"] == expected)


def test_bulk_entity_with_experts_owners():
    rc = ReaderConfiguration()
    reader = Reader(rc)

    json_rows = [{
        "typeName": "demoType",
        "name": "entityNameABC",
        "qualifiedName": "qualifiedNameofEntityNameABC",
        "classifications": None,
        "experts": "a;b;",
        "owners": ""
    }, {
        "typeName": "demoType",
        "name": "entityNameGHI",
        "qualifiedName": "qualifiedNameofEntityNameGHI",
        "classifications": None,
        "experts": "a;b;",
        "owners": "c;d"
    }, {
        "typeName": "demoType",
        "name": "entityNameJKL",
        "qualifiedName": "qualifiedNameofEntityNameJKL",
        "classifications": None
    }]

    results = reader.parse_bulk_entities(json_rows)

    assert ("contacts" in results["entities"][0])
    exp_only = results["entities"][0]["contacts"]
    both = results["entities"][1]["contacts"]
    no_contacts = results["entities"][2]

    assert (len(exp_only["Owner"]) == 0)
    assert (len(exp_only["Expert"]) == 2)
    assert (len(both["Owner"]) == 2)
    assert (len(both["Expert"]) == 2)
    assert ("contacts" not in no_contacts)


def test_table_lineage_multiple_inputs():
    reader = Reader(READER_CONFIG)
    json_tables = [{
        "Target table": "table1",
        "Target type": "demo_type",
        "Source table": "table0",
        "Source type": "demo_type",
        "Process name": "proc01",
        "Process type": "proc_type"
    }, {
        "Target table": "table1",
        "Target type": "demo_type",
        "Source table": "tableB",
        "Source type": "demo_type",
        "Process name": "proc01",
        "Process type": "proc_type"
    }]

    results = reader.parse_table_lineage(json_rows=json_tables)

    assert (len(results) == 4)
    assert (results[3].to_json(minimum=True) == {
        "typeName": "proc_type",
        "guid": -1003,
        "qualifiedName": "proc01"
    })
    process_inputs_qualified_names = [
        p["qualifiedName"] for p in results[3].inputs
    ]
    process_outputs_qualified_names = [
        p["qualifiedName"] for p in results[3].outputs
    ]
    assert (len(process_inputs_qualified_names) == 2)
    assert (len(process_outputs_qualified_names) == 1)

    assert (set(process_inputs_qualified_names) == set(["table0", "tableB"]))
    assert (set(process_outputs_qualified_names) == set(["table1"]))


def test_column_lineage_entities():
    reader = Reader(READER_CONFIG)

    json_tables, json_columns, atlas_typedefs = setup_column_lineage_entities()

    # parse_table_lineage assigns placeholder guids; -1003 is the last one
    tables_and_processes = reader.parse_table_lineage(json_tables)

    results = reader.parse_column_lineage(json_columns, tables_and_processes,
                                          atlas_typedefs)

    # Two column entities
    # One process entity
    target_col_entity = results[0].to_json()
    source_col_entity = results[1].to_json()
    col_lineage_entity = results[2].to_json()

    assert (target_col_entity["typeName"] == "demo_column")
    assert (target_col_entity["relationshipAttributes"]["table"]["typeName"] ==
            "demo_table")
    assert (source_col_entity["typeName"] == "demo_column")
    assert (source_col_entity["relationshipAttributes"]["table"]["typeName"] ==
            "demo_table")
    assert (col_lineage_entity["typeName"] == "demo_column_lineage")

    for entity in (col_lineage_entity["attributes"]["inputs"] +
                   col_lineage_entity["attributes"]["outputs"]):
        assert (entity["typeName"] == "demo_column")

    # Check that this points to the correct table process with a (default)
    # query reference in relationshipAttributes
    assert ("query" in col_lineage_entity["relationshipAttributes"])
    assert (col_lineage_entity["relationshipAttributes"]["query"]["typeName"]
            == "demo_process")


def test_column_lineage_entities_when_multi_tabled_inputs():
    reader = Reader(READER_CONFIG)
    json_tables, json_columns, atlas_typedefs = setup_column_lineage_entities()
    # Adding in an extra table
    json_tables.append({
        "Target table": "table1",
        "Target type": "demo_table",
        "Source table": "tableB",
        "Source type": "demo_table",
        "Process name": "proc01",
        "Process type": "demo_process"
    })
    json_columns[0].update({"transformation": "colB + col0"})
    # Adding in an extra column
    json_columns.append({
        "Target column": "col1",
        "Target table": "table1",
        "Source column": "colB",
        "Source table": "tableB",
        "transformation": "colB + col0"
    })
    expected_col_map_obj = [{
        "ColumnMapping": [{
            "Source": "col0",
            "Sink": "col1"
        }],
        "DatasetMapping": {
            "Source": "table0",
            "Sink": "table1"
        }
    }, {
        "ColumnMapping": [{
            "Source": "colB",
            "Sink": "col1"
        }],
        "DatasetMapping": {
            "Source": "tableB",
            "Sink": "table1"
        }
    }]

    table_entities = reader.parse_table_lineage(json_tables)
    column_entities = reader.parse_column_lineage(json_columns,
                                                  table_entities,
                                                  atlas_typedefs,
                                                  use_column_mapping=True)

    # Three columns and one process entity
    assert (len(column_entities) == 4)
    process_entities = [
        e for e in column_entities if isinstance(e, AtlasProcess)
    ]
    assert (len(process_entities) == 1)
    process_entity = process_entities[0]

    process_inputs_qualified_names = [
        p["qualifiedName"] for p in process_entity.inputs
    ]
    process_outputs_qualified_names = [
        p["qualifiedName"] for p in process_entity.outputs
    ]
    assert (len(process_inputs_qualified_names) == 2)
    assert (len(process_outputs_qualified_names) == 1)

    assert (set(process_inputs_qualified_names) == set(
        ["table0#col0", "tableB#colB"]))
    assert (set(process_outputs_qualified_names) == set(["table1#col1"]))

    table_process_entities = [
        e for e in table_entities if isinstance(e, AtlasProcess)
    ]
    table_process_entity = table_process_entities[0]
    # The table process should now contain the expected column mappings
    assert ("columnMapping" in table_process_entity.attributes)
    resulting_colmap = json.loads(
        table_process_entity.attributes["columnMapping"])
    assert (len(expected_col_map_obj) == len(resulting_colmap))
    assert (all([res in expected_col_map_obj for res in resulting_colmap]))


def test_parse_update_lineage():
    reader = Reader(READER_CONFIG)
    json_rows = [{
        "Target typeName": "demo_table",
        "Target qualifiedName": "demotarget",
        "Source typeName": "demo_table2",
        "Source qualifiedName": "demosource",
        "Process name": "proc01",
        "Process qualifiedName": "procqual01",
        "Process typeName": "Process2"
    }, {
        "Target typeName": "demo_table",
        "Target qualifiedName": "demotarget02",
        "Source typeName": None,
        "Source qualifiedName": None,
        "Process name": "proc02",
        "Process qualifiedName": "procqual02",
        "Process typeName": "Process3"
    }, {
        "Target typeName": None,
        "Target qualifiedName": None,
        "Source typeName": "demo_table2",
        "Source qualifiedName": "demosource03",
        "Process name": "proc03",
        "Process qualifiedName": "procqual03",
        "Process typeName": "Process4"
    }, {
        "Target typeName": "N/A",
        "Target qualifiedName": "N/A",
        "Source typeName": "demo_table2",
        "Source qualifiedName": "demosource03",
        "Process name": "proc04",
        "Process qualifiedName": "procqual04",
        "Process typeName": "Process5"
    }]

    results = reader.parse_update_lineage(json_rows)

    assert (len(results) == 4)
    full_update = results[0]
    target_update = results[1]
    source_update = results[2]
    target_destroy = results[3]

    assert (full_update["typeName"] == "Process2")
    assert (full_update["attributes"]["name"] == "proc01")
    assert (len(full_update["attributes"]["inputs"]) == 1)
    assert (len(full_update["attributes"]["outputs"]) == 1)

    fullupd_input = full_update["attributes"]["inputs"][0]
    fullupd_output = full_update["attributes"]["outputs"][0]

    assert (fullupd_input == {
        "typeName": "demo_table2",
        "uniqueAttributes": {
            "qualifiedName": "demosource"
        }
    })
    assert (fullupd_output == {
        "typeName": "demo_table",
        "uniqueAttributes": {
            "qualifiedName": "demotarget"
        }
    })

    # For a partial update, inputs will be set to None
    assert (target_update["attributes"]["inputs"] is None)

    # For a partial update, outputs will be set to None
    assert (source_update["attributes"]["outputs"] is None)

    # Using the "N/A" keyword in qualifiedName empties that side of the lineage
    assert (target_destroy["attributes"]["outputs"] == [])
    assert (target_destroy["attributes"]["inputs"] == [{
        "typeName": "demo_table2",
        "uniqueAttributes": {
            "qualifiedName": "demosource03"
        }
    }])