import json

import pytest

# NOTE: the import paths below assume the pyapacheatlas package layout. The
# shared READER_CONFIG (a ReaderConfiguration instance) and the
# setup_column_lineage_entities() helper, which returns
# (json_tables, json_columns, atlas_typedefs), are defined elsewhere in this
# test module.
from pyapacheatlas.core import AtlasProcess
from pyapacheatlas.core.typedef import AtlasAttributeDef
from pyapacheatlas.readers.reader import Reader, ReaderConfiguration


def test_column_lineage_entities_with_classifications():
    reader = Reader(READER_CONFIG)
    json_tables, json_columns, atlas_typedefs = setup_column_lineage_entities()
    # Update the target column to include two classifications
    json_columns[0].update({
        "Target classifications": "CustomerInfo; PII",
        "Source classifications": ""
    })

    # Outputs -1003 as the last guid
    tables_and_processes = reader.parse_table_lineage(json_tables)

    results = reader.parse_column_lineage(
        json_columns, tables_and_processes, atlas_typedefs)

    # Two column entities
    # One process entity
    target_col_entity = results[0]
    source_col_entity = results[1]
    col_lineage_entity = results[2]

    assert len(target_col_entity.classifications) == 2
    assert {"typeName": "CustomerInfo",
            "attributes": {}} in target_col_entity.classifications
    assert {"typeName": "PII",
            "attributes": {}} in target_col_entity.classifications
    assert len(source_col_entity.classifications) == 0

def test_parse_update_lineage_multi_out():
    reader = Reader(READER_CONFIG)
    json_rows = [
        {
            "Target typeName": "demo_table",
            "Target qualifiedName": "demotarget",
            "Source typeName": "demo_table2",
            "Source qualifiedName": "demosource",
            "Process name": "proc01",
            "Process qualifiedName": "procqual01",
            "Process typeName": "Process2"
        },
        {
            "Target typeName": "demo_table",
            "Target qualifiedName": "demotarget2",
            "Source typeName": None,
            "Source qualifiedName": None,
            "Process name": "proc01",
            "Process qualifiedName": "procqual01",
            "Process typeName": "Process2"
        }
    ]

    results = reader.parse_update_lineage(json_rows)

    # Both rows reference the same process qualifiedName, so they collapse
    # into a single process update with two outputs
    assert len(results) == 1
    inputs = results[0]["attributes"]["inputs"]
    outputs = results[0]["attributes"]["outputs"]
    assert len(outputs) == 2
    output_names = set(
        [x["uniqueAttributes"]["qualifiedName"] for x in outputs])
    assert output_names == set(["demotarget", "demotarget2"])
    assert len(inputs) == 1

def test_parse_update_lineage_multi_row_with_na_last():
    reader = Reader(READER_CONFIG)
    json_rows = [
        {
            "Target typeName": "demo_table",
            "Target qualifiedName": "demotarget",
            "Source typeName": "demo_table2",
            "Source qualifiedName": "demosource",
            "Process name": "proc01",
            "Process qualifiedName": "procqual01",
            "Process typeName": "Process2"
        },
        {
            "Target typeName": "N/A",
            "Target qualifiedName": "N/A",
            "Source typeName": "demo_table2",
            "Source qualifiedName": "demosource2",
            "Process name": "proc01",
            "Process qualifiedName": "procqual01",
            "Process typeName": "Process2"
        },
    ]

    with pytest.warns(UserWarning):
        results = reader.parse_update_lineage(json_rows)

    assert len(results) == 1
    inputs = results[0]["attributes"]["inputs"]
    outputs = results[0]["attributes"]["outputs"]
    assert len(outputs) == 0
    assert len(inputs) == 2

def test_column_lineage_entities_with_attributes():
    reader = Reader(READER_CONFIG)
    json_tables, json_columns, atlas_typedefs = setup_column_lineage_entities()
    # Add extra attributes to both the target and source columns
    json_columns[0].update({
        "Target test_attrib1": "value",
        "Target test_attrib2": "value2",
        "Source foo": "bar"
    })

    # Outputs -1003 as the last guid
    tables_and_processes = reader.parse_table_lineage(json_tables)

    results = reader.parse_column_lineage(
        json_columns, tables_and_processes, atlas_typedefs)

    # Two column entities
    # One process entity
    target_col_entity = results[0]
    source_col_entity = results[1]
    col_lineage_entity = results[2]

    assert target_col_entity.attributes["test_attrib1"] == "value"
    assert target_col_entity.attributes["test_attrib2"] == "value2"
    assert source_col_entity.attributes["foo"] == "bar"

def test_table_lineage():
    reader = Reader(READER_CONFIG)
    json_rows = [{
        "Target table": "table1",
        "Target type": "demo_type",
        "Source table": "table0",
        "Source type": "demo_type2",
        "Process name": "proc01",
        "Process type": "proc_type"
    }]

    results = reader.parse_table_lineage(json_rows)

    assert results[0].to_json(minimum=True) == {
        "typeName": "demo_type",
        "guid": -1001,
        "qualifiedName": "table1"
    }
    assert results[1].to_json(minimum=True) == {
        "typeName": "demo_type2",
        "guid": -1002,
        "qualifiedName": "table0"
    }
    assert results[2].to_json(minimum=True) == {
        "typeName": "proc_type",
        "guid": -1003,
        "qualifiedName": "proc01"
    }

def test_parse_entity_defs():
    rc = ReaderConfiguration()
    reader = Reader(rc)
    # "Entity TypeName", "name", "description",
    # "isOptional", "isUnique", "defaultValue",
    # "typeName", "displayName", "valuesMinCount",
    # "valuesMaxCount", "cardinality", "includeInNotification",
    # "indexType", "isIndexable"
    json_rows = [{
        "Entity TypeName": "demoType",
        "name": "attrib1",
        "description": "Some desc",
        "isOptional": "True",
        "isUnique": "False",
        "defaultValue": None,
        "typeName": "string",
        "displayName": None,
        "valuesMinCount": None,
        "valuesMaxCount": None,
        "cardinality": None,
        "includeInNotification": None,
        "indexType": None,
        "isIndexable": None
    }]

    results = reader.parse_entity_defs(json_rows)

    assert "entityDefs" in results
    assert len(results["entityDefs"]) == 1
    assert results["entityDefs"][0]["attributeDefs"][0]["name"] == "attrib1"

def test_parse_classification_defs():
    rc = ReaderConfiguration()
    reader = Reader(rc)
    json_rows = [
        {
            "classificationName": "testClassification",
            "entityTypes": None,
            "description": "This is my classification"
        },
        {
            "classificationName": "testClassification2",
            "entityTypes": "",
            "description": "This is my classification2"
        },
        {
            "classificationName": "testClassification3",
            "entityTypes": "DataSet;Process",
            "description": "This is my classification3"
        },
        {
            "classificationName": "testClassification4",
            "entityTypes": "DataSet;",
            "description": "This is my classification4"
        }
    ]

    parsed = reader.parse_classification_defs(json_rows)
    results = parsed["classificationDefs"]

    assert len(results) == 4
    assert "description" in results[0]
    assert results[0]["name"] == "testClassification"
    assert len(results[0]["entityTypes"]) == 0
    assert len(results[1]["entityTypes"]) == 0
    assert len(results[2]["entityTypes"]) == 2
    assert len(results[3]["entityTypes"]) == 1

def test_parse_bulk_entities_with_terms():
    rc = ReaderConfiguration()
    reader = Reader(rc)
    # "typeName", "name",
    # "qualifiedName", "classifications",
    # "[Relationship] meanings"
    json_rows = [
        {
            "typeName": "demo_table",
            "name": "entityNameABC",
            "qualifiedName": "qualifiedNameofEntityNameABC",
            "classifications": None,
            "[Relationship] meanings": "My Term;abc"
        },
        {
            "typeName": "demo_table",
            "name": "entityNameDEF",
            "qualifiedName": "qualifiedNameofEntityNameDEF",
            "classifications": None,
            "[Relationship] meanings": None
        }
    ]

    results = reader.parse_bulk_entities(json_rows)
    ae1 = results["entities"][0]
    ae2 = results["entities"][1]

    assert "meanings" in ae1["relationshipAttributes"]
    assert "meanings" not in ae2["relationshipAttributes"]
    ae1_meanings = ae1["relationshipAttributes"]["meanings"]

    assert len(ae1_meanings) == 2
    ae1_meanings_qns = set(
        [e["uniqueAttributes"]["qualifiedName"] for e in ae1_meanings])
    assert set(["My Term@Glossary", "abc@Glossary"]) == ae1_meanings_qns

def test_parse_bulk_entities_with_relationships():
    rc = ReaderConfiguration()
    reader = Reader(rc)
    # "typeName", "name",
    # "qualifiedName", "classifications",
    # "[Relationship] table"
    json_rows = [
        {
            "typeName": "demo_table",
            "name": "entityNameABC",
            "qualifiedName": "qualifiedNameofEntityNameABC",
            "classifications": None,
            "[Relationship] table": None
        },
        {
            "typeName": "demo_column",
            "name": "col1",
            "qualifiedName": "col1qn",
            "classifications": None,
            "[Relationship] table": "qualifiedNameofEntityNameABC"
        },
        {
            "typeName": "demo_column",
            "name": "col2",
            "qualifiedName": "col2qn",
            "classifications": None,
            "[Relationship] table": None
        }
    ]

    results = reader.parse_bulk_entities(json_rows)
    abc = results["entities"][0]
    col1 = results["entities"][1]
    col2 = results["entities"][2]

    assert "table" in col1["relationshipAttributes"]
    col1_table = col1["relationshipAttributes"]["table"]
    assert col1_table["typeName"] == "demo_table"
    assert col1_table["qualifiedName"] == "qualifiedNameofEntityNameABC"
    assert "table" not in col2["relationshipAttributes"]

def test_parse_bulk_entities():
    rc = ReaderConfiguration()
    reader = Reader(rc)
    # "typeName", "name",
    # "qualifiedName", "classifications"
    json_rows = [
        {
            "typeName": "demoType",
            "name": "entityNameABC",
            "qualifiedName": "qualifiedNameofEntityNameABC",
            "classifications": None
        },
        {
            "typeName": "demoType",
            "name": "entityNameGHI",
            "qualifiedName": "qualifiedNameofEntityNameGHI",
            "classifications": "PII;CLASS2"
        },
        {
            "typeName": "demoType",
            "name": "entityNameJKL",
            "qualifiedName": "qualifiedNameofEntityNameJKL",
            "classifications": "PII"
        },
        {
            "typeName": "demoType",
            "name": "entityNameDynamic",
            "qualifiedName": "qualifiedNameofEntityNameDynamic",
            "classifications": None,
            "dynamicAttrib1": "foo",
            "dynamicAttrib2": "bar"
        }
    ]

    results = reader.parse_bulk_entities(json_rows)

    assert "entities" in results
    assert len(results["entities"]) == len(json_rows)
    abc = results["entities"][0]
    ghi = results["entities"][1]
    jkl = results["entities"][2]
    dynamic = results["entities"][3]

    assert "classifications" not in abc
    assert len(ghi["classifications"]) == 2
    assert len(jkl["classifications"]) == 1

    # The classifications should default to NOT propagate
    assert all(
        c["propagate"] is False
        for c in ghi["classifications"] + jkl["classifications"]
    )
    assert jkl["classifications"][0]["typeName"] == "PII"

    ghi_classification_types = set(
        [x["typeName"] for x in ghi["classifications"]])
    assert set(["PII", "CLASS2"]) == ghi_classification_types

    assert "dynamicAttrib1" in dynamic["attributes"]
    assert dynamic["attributes"]["dynamicAttrib1"] == "foo"
    assert "dynamicAttrib2" in dynamic["attributes"]
    assert dynamic["attributes"]["dynamicAttrib2"] == "bar"

def test_parse_entity_defs_extended():
    rc = ReaderConfiguration()
    reader = Reader(rc)
    json_rows = [
        {
            "Entity TypeName": "generic",
            "name": "attrib1",
            "description": "desc1",
            "isOptional": "True",
            "isUnique": "False",
            "defaultValue": None
        },
        {
            "Entity TypeName": "generic",
            "name": "attrib2",
            "description": "desc2",
            "isOptional": "True",
            "isUnique": "False",
            "defaultValue": None,
            "cardinality": "SINGLE"
        },
        {
            "Entity TypeName": "demo",
            "name": "attrib3",
            "description": "desc3",
            "isOptional": "False",
            "isUnique": "False",
            "cardinality": "SET"
        }
    ]

    output = reader.parse_entity_defs(json_rows)

    # It is an AtlasTypesDef composite wrapper
    assert "entityDefs" in output.keys()
    # There are two entity typenames specified so there should be only two entityDefs
    assert len(output["entityDefs"]) == 2

    genericEntityDef = None
    demoEntityDef = None
    for entityDef in output["entityDefs"]:
        if entityDef["name"] == "generic":
            genericEntityDef = entityDef
        elif entityDef["name"] == "demo":
            demoEntityDef = entityDef

    # Generic has two attributes
    assert len(genericEntityDef["attributeDefs"]) == 2

    # Demo has one attribute
    assert len(demoEntityDef["attributeDefs"]) == 1

    assert demoEntityDef["attributeDefs"][0] == AtlasAttributeDef(
        name="attrib3",
        **{
            "description": "desc3",
            "isOptional": "False",
            "isUnique": "False",
            "cardinality": "SET"
        }).to_json()

def test_entityDefs_warns_with_extra_params():
    rc = ReaderConfiguration()
    reader = Reader(rc)
    # All attribute keys should be converted to camel case except "Entity TypeName"
    inputData = [
        {
            "Entity TypeName": "generic",
            "name": "attrib1",
            "description": "desc1",
            "isOptional": "True",
            "isUnique": "False",
            "defaultValue": None
        },
        {
            "Entity TypeName": "generic",
            "name": "attrib2",
            "description": "desc2",
            "isOptional": "True",
            "isUnique": "False",
            "defaultValue": None,
            "cardinality": "SINGLE",
            "randomAttrib": "foobar"
        }
    ]

    # Assert that a UserWarning occurs when adding an extra attribute
    with pytest.warns(UserWarning):
        reader.parse_entity_defs(json_rows=inputData)

def test_table_lineage_with_attributes():
    reader = Reader(READER_CONFIG)
    json_rows = [{
        "Target table": "table1",
        "Target type": "demo_type",
        "Target data_type": "str",
        "Source table": "table0",
        "Source type": "demo_type2",
        "Source foo": "bar",
        "Process name": "proc01",
        "Process type": "proc_type",
        "Process fizz": "buzz"
    }]

    results = reader.parse_table_lineage(json_rows)

    assert results[0].attributes["data_type"] == "str"
    assert results[1].attributes["foo"] == "bar"
    assert results[2].attributes["fizz"] == "buzz"

def test_column_lineage_entities_with_columnMapping():
    reader = Reader(READER_CONFIG)
    expected_obj = [
        {
            "ColumnMapping": [
                {"Source": "col0", "Sink": "col1"},
                {"Source": "col90", "Sink": "col99"}
            ],
            "DatasetMapping": {"Source": "table0", "Sink": "table1"}
        }
    ]
    # The columnMapping attribute is stored as the serialized JSON string of
    # expected_obj
    expected = json.dumps(expected_obj)

    json_tables, json_columns, atlas_typedefs = setup_column_lineage_entities()
    json_columns.append({
        "Target column": "col99",
        "Target table": "table1",
        "Source column": "col90",
        "Source table": "table0",
        "transformation": "col90 + 1"
    })

    # Outputs -1003 as the last guid
    tables_and_processes = reader.parse_table_lineage(json_tables)

    results = reader.parse_column_lineage(
        json_columns, tables_and_processes, atlas_typedefs,
        use_column_mapping=True)

    # Demonstrating column lineage
    assert "columnMapping" in tables_and_processes[2].attributes
    assert tables_and_processes[2].attributes["columnMapping"] == expected

def test_bulk_entity_with_experts_owners():
    rc = ReaderConfiguration()
    reader = Reader(rc)
    json_rows = [
        {
            "typeName": "demoType",
            "name": "entityNameABC",
            "qualifiedName": "qualifiedNameofEntityNameABC",
            "classifications": None,
            "experts": "a;b;",
            "owners": ""
        },
        {
            "typeName": "demoType",
            "name": "entityNameGHI",
            "qualifiedName": "qualifiedNameofEntityNameGHI",
            "classifications": None,
            "experts": "a;b;",
            "owners": "c;d"
        },
        {
            "typeName": "demoType",
            "name": "entityNameJKL",
            "qualifiedName": "qualifiedNameofEntityNameJKL",
            "classifications": None
        }
    ]

    results = reader.parse_bulk_entities(json_rows)

    assert "contacts" in results["entities"][0]
    exp_only = results["entities"][0]["contacts"]
    both = results["entities"][1]["contacts"]
    no_contacts = results["entities"][2]

    assert len(exp_only["Owner"]) == 0
    assert len(exp_only["Expert"]) == 2
    assert len(both["Owner"]) == 2
    assert len(both["Expert"]) == 2
    assert "contacts" not in no_contacts

def test_table_lineage_multiple_inputs():
    reader = Reader(READER_CONFIG)
    json_tables = [
        {
            "Target table": "table1",
            "Target type": "demo_type",
            "Source table": "table0",
            "Source type": "demo_type",
            "Process name": "proc01",
            "Process type": "proc_type"
        },
        {
            "Target table": "table1",
            "Target type": "demo_type",
            "Source table": "tableB",
            "Source type": "demo_type",
            "Process name": "proc01",
            "Process type": "proc_type"
        }
    ]

    results = reader.parse_table_lineage(json_rows=json_tables)

    assert len(results) == 4
    assert results[3].to_json(minimum=True) == {
        "typeName": "proc_type",
        "guid": -1003,
        "qualifiedName": "proc01"
    }
    process_inputs_qualified_names = [
        p["qualifiedName"] for p in results[3].inputs]
    process_outputs_qualified_names = [
        p["qualifiedName"] for p in results[3].outputs]
    assert len(process_inputs_qualified_names) == 2
    assert len(process_outputs_qualified_names) == 1

    assert set(process_inputs_qualified_names) == set(["table0", "tableB"])
    assert set(process_outputs_qualified_names) == set(["table1"])

def test_column_lineage_entities():
    reader = Reader(READER_CONFIG)
    json_tables, json_columns, atlas_typedefs = setup_column_lineage_entities()

    # Outputs -1003 as the last guid
    tables_and_processes = reader.parse_table_lineage(json_tables)

    results = reader.parse_column_lineage(
        json_columns, tables_and_processes, atlas_typedefs)

    # Two column entities
    # One process entity
    target_col_entity = results[0].to_json()
    source_col_entity = results[1].to_json()
    col_lineage_entity = results[2].to_json()

    assert target_col_entity["typeName"] == "demo_column"
    assert target_col_entity["relationshipAttributes"]["table"]["typeName"] == "demo_table"
    assert source_col_entity["typeName"] == "demo_column"
    assert source_col_entity["relationshipAttributes"]["table"]["typeName"] == "demo_table"
    assert col_lineage_entity["typeName"] == "demo_column_lineage"

    for entity in (col_lineage_entity["attributes"]["inputs"] +
                   col_lineage_entity["attributes"]["outputs"]):
        assert entity["typeName"] == "demo_column"

    # Check that this points to the correct table process with a (default)
    # query reference in relationshipAttributes
    proc_relationship_query_is_demo_process = False
    assert "query" in col_lineage_entity["relationshipAttributes"]
    if "query" in col_lineage_entity["relationshipAttributes"]:
        proc_relationship_query_is_demo_process = col_lineage_entity[
            "relationshipAttributes"]["query"]["typeName"] == "demo_process"
    assert proc_relationship_query_is_demo_process

def test_column_lineage_entities_when_multi_tabled_inputs():
    reader = Reader(READER_CONFIG)
    json_tables, json_columns, atlas_typedefs = setup_column_lineage_entities()
    # Adding in an extra table
    json_tables.append({
        "Target table": "table1",
        "Target type": "demo_table",
        "Source table": "tableB",
        "Source type": "demo_table",
        "Process name": "proc01",
        "Process type": "demo_process"
    })
    json_columns[0].update({"transformation": "colB + col0"})
    # Adding in an extra column
    json_columns.append({
        "Target column": "col1",
        "Target table": "table1",
        "Source column": "colB",
        "Source table": "tableB",
        "transformation": "colB + col0"
    })
    expected_col_map_obj = [
        {
            "ColumnMapping": [{"Source": "col0", "Sink": "col1"}],
            "DatasetMapping": {"Source": "table0", "Sink": "table1"}
        },
        {
            "ColumnMapping": [{"Source": "colB", "Sink": "col1"}],
            "DatasetMapping": {"Source": "tableB", "Sink": "table1"}
        }
    ]

    table_entities = reader.parse_table_lineage(json_tables)
    column_entities = reader.parse_column_lineage(
        json_columns, table_entities, atlas_typedefs, use_column_mapping=True)

    # Three columns and one process entity
    assert len(column_entities) == 4
    process_entities = [
        e for e in column_entities if isinstance(e, AtlasProcess)]
    assert len(process_entities) == 1
    process_entity = process_entities[0]

    process_inputs_qualified_names = [
        p["qualifiedName"] for p in process_entity.inputs]
    process_outputs_qualified_names = [
        p["qualifiedName"] for p in process_entity.outputs]
    assert len(process_inputs_qualified_names) == 2
    assert len(process_outputs_qualified_names) == 1

    assert set(process_inputs_qualified_names) == set(
        ["table0#col0", "tableB#colB"])
    assert set(process_outputs_qualified_names) == set(["table1#col1"])

    table_process_entities = [
        e for e in table_entities if isinstance(e, AtlasProcess)]
    table_process_entity = table_process_entities[0]

    # Should now contain the expected column mappings
    assert "columnMapping" in table_process_entity.attributes
    resulting_colmap = json.loads(
        table_process_entity.attributes["columnMapping"])
    assert len(expected_col_map_obj) == len(resulting_colmap)
    assert all([res in expected_col_map_obj for res in resulting_colmap])

def test_parse_update_lineage():
    reader = Reader(READER_CONFIG)
    json_rows = [
        {
            "Target typeName": "demo_table",
            "Target qualifiedName": "demotarget",
            "Source typeName": "demo_table2",
            "Source qualifiedName": "demosource",
            "Process name": "proc01",
            "Process qualifiedName": "procqual01",
            "Process typeName": "Process2"
        },
        {
            "Target typeName": "demo_table",
            "Target qualifiedName": "demotarget02",
            "Source typeName": None,
            "Source qualifiedName": None,
            "Process name": "proc02",
            "Process qualifiedName": "procqual02",
            "Process typeName": "Process3"
        },
        {
            "Target typeName": None,
            "Target qualifiedName": None,
            "Source typeName": "demo_table2",
            "Source qualifiedName": "demosource03",
            "Process name": "proc03",
            "Process qualifiedName": "procqual03",
            "Process typeName": "Process4"
        },
        {
            "Target typeName": "N/A",
            "Target qualifiedName": "N/A",
            "Source typeName": "demo_table2",
            "Source qualifiedName": "demosource03",
            "Process name": "proc04",
            "Process qualifiedName": "procqual04",
            "Process typeName": "Process5"
        }
    ]

    results = reader.parse_update_lineage(json_rows)

    assert len(results) == 4
    full_update = results[0]
    target_update = results[1]
    source_update = results[2]
    target_destroy = results[3]

    assert full_update["typeName"] == "Process2"
    assert full_update["attributes"]["name"] == "proc01"
    assert len(full_update["attributes"]["inputs"]) == 1
    assert len(full_update["attributes"]["outputs"]) == 1

    fullupd_input = full_update["attributes"]["inputs"][0]
    fullupd_output = full_update["attributes"]["outputs"][0]
    assert fullupd_input == {
        "typeName": "demo_table2",
        "uniqueAttributes": {"qualifiedName": "demosource"}
    }
    assert fullupd_output == {
        "typeName": "demo_table",
        "uniqueAttributes": {"qualifiedName": "demotarget"}
    }

    # For a partial update, inputs will be set to None
    assert target_update["attributes"]["inputs"] is None
    # For a partial update, outputs will be set to None
    assert source_update["attributes"]["outputs"] is None

    # Using the "N/A" keyword in the target qualifiedName destroys that side
    # of the lineage (outputs are emptied rather than left as None)
    assert target_destroy["attributes"]["outputs"] == []
    assert target_destroy["attributes"]["inputs"] == [{
        "typeName": "demo_table2",
        "uniqueAttributes": {"qualifiedName": "demosource03"}
    }]
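

# Illustrative sketch (not a test): the dicts produced by
# reader.parse_update_lineage are plain entity payloads, so outside of the
# test suite they would typically be pushed to Atlas/Purview through a
# pyapacheatlas client. The endpoint and credential values below are
# placeholders; verify the client signatures against your installed version
# before relying on this.
#
#   from pyapacheatlas.auth import ServicePrincipalAuthentication
#   from pyapacheatlas.core import AtlasClient
#
#   auth = ServicePrincipalAuthentication(
#       tenant_id="...", client_id="...", client_secret="...")
#   client = AtlasClient(
#       endpoint_url="https://your-atlas-host/api/atlas/v2",
#       authentication=auth)
#   client.upload_entities(batch=results)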