def test_would_it_overwrite():
    entities = [
        AtlasEntity("dummy1", "demo_table", "dummy1", -99,
                    attributes={"req_attrib": "1"}).to_json(),
        AtlasEntity("dummy2", "demo_table", "dummy1", -100,
                    attributes={"foo": "bar"}).to_json()
    ]
    new_entity = AtlasEntity("dummy1", "demo_table", "dummy1", -99,
                             attributes={"req_attrib": "1"}).to_json()
    demo_table_type = {"entityDefs": []}

    local_what_if = WhatIfValidator(existing_entities=entities)

    results = local_what_if.entity_would_overwrite(new_entity)

    assert(results)
def test_purview_search_iterates_safely_over_multiple():
    ae = AtlasEntity(
        name="there_can_be_only_two",
        qualified_name="pyapacheatlas://there_can_be_only_two_00",
        guid=-100,
        typeName="hive_table"
    )
    ae2 = AtlasEntity(
        name="there_can_be_only_two",
        qualified_name="pyapacheatlas://there_can_be_only_two_01",
        guid=-101,
        typeName="hive_table"
    )
    upload_success = client.upload_entities([ae, ae2])

    search_results = client.search_entities(r"there_can_be_only_two")
    counter = 0
    for entity in search_results:
        # Confirm each search hit exposes an id before counting it.
        len(entity["id"])
        counter = counter + 1
    assert(counter == 2)
def test_prepare_bulk_entity_from_mixed_atlas_entity_dict():
    class_entity = AtlasEntity(
        name=sample_entity["attributes"]["name"],
        typeName=sample_entity["typeName"],
        qualified_name=sample_entity["attributes"]["qualifiedName"],
        attributes=sample_entity["attributes"],
        guid=sample_entity["guid"],
        relationshipAttributes=sample_entity["relationshipAttributes"])
    class_entity2 = AtlasEntity(
        name=sample_entity["attributes"]["name"] + "abc",
        typeName=sample_entity["typeName"],
        qualified_name=sample_entity["attributes"]["qualifiedName"] + "abc",
        attributes=sample_entity["attributes"],
        guid=sample_entity["guid"],
        relationshipAttributes=sample_entity["relationshipAttributes"])

    results = AtlasClient._prepare_entity_upload(
        [class_entity, class_entity2.to_json()])

    sample2 = sample_entity.copy()
    sample2["attributes"]["name"] = sample2["attributes"]["name"] + "abc"
    sample2["attributes"][
        "qualifiedName"] = sample2["attributes"]["qualifiedName"] + "abc"

    expected = {"entities": [sample_entity, sample2]}

    assert(results == expected)
def test_missing_req_attributes():
    entities = [
        AtlasEntity("dummy1", "demo_table", "dummy1", -99,
                    attributes={"req_attrib": "1"}).to_json(),
        AtlasEntity("dummy2", "demo_table", "dummy1", -100).to_json()
    ]
    demo_table_type = {
        "entityDefs": [{
            'category': 'ENTITY',
            'name': 'demo_table',
            'attributeDefs': [{"name": "req_attrib", "isOptional": False}],
            'relationshipAttributeDefs': [],
            'superTypes': ['DataSet']
        }]
    }

    local_what_if = WhatIfValidator(demo_table_type)

    results = [local_what_if.entity_missing_attributes(e) for e in entities]

    assert(results[0] == False)
    assert(results[1] == {'req_attrib'})
def test_type_doesnt_exist():
    entities = [
        AtlasEntity("dummy1", "demo_table", "dummy1", -99).to_json(),
        AtlasEntity("dummy2", "foobar", "dummy1", -100).to_json()
    ]

    results = [whatif.entity_type_exists(e) for e in entities]

    assert(results[0] == True)
    assert(results[1] == False)
def test_batches_entities_with_real_guid():
    gt = GuidTracker()
    a = AtlasEntity("A", "DataSet", "A", guid=gt.get_guid())
    b = AtlasEntity("B", "DataSet", "B", guid=gt.get_guid())
    b.addRelationship(table=a)
    c = AtlasEntity("C", "DataSet", "C", guid=gt.get_guid())
    d = AtlasEntity("D", "DataSet", "D", guid=gt.get_guid())
    c.addRelationship(tester={"guid": "abc-123"})

    entities = [x.to_json() for x in [a, b, c, d]]

    results = batch_dependent_entities(entities, batch_size=2)
    assert(len(results) == 2)
def convert_Spline_to_Purview(splineJson):
    splineJson = json.loads(splineJson)

    # Get notebook info
    notebookInfo = splineJson["extraInfo"]["notebookInfo"]["obj"]
    notebookURL = notebookInfo["notebookURL"].replace("\\", "")
    guid = GuidTracker()

    # Get inputs
    inputs = []
    for read in splineJson["operations"]["reads"]:
        input_path = read["inputSources"][0].replace(
            notebookInfo["mounts"][0],
            "https://adldata.dfs.core.windows.net/data/")
        input = AtlasEntity(name=input_path.split("/")[-1],
                            typeName="azure_datalake_gen2_path",
                            qualified_name=input_path,
                            guid=guid.get_guid())
        inputs.append(input)

    # Get outputs
    write = splineJson["operations"]["write"]
    output_path = write["outputSource"].replace(
        notebookInfo["mounts"][0],
        "https://adldata.dfs.core.windows.net/data/")
    output = AtlasEntity(name=output_path.split("/")[-1],
                         typeName="azure_datalake_gen2_path",
                         qualified_name=output_path,
                         guid=guid.get_guid())

    # Get Process
    process_attributes = {
        "name": notebookInfo["name"],
        "owner": notebookInfo["user"],
        "description": f"Link to spark job notebook: http://{notebookURL}",
        "startTime": notebookInfo["timestamp"],
        "endTime": notebookInfo["timestamp"]
    }
    process = AtlasProcess(name=notebookInfo["name"],
                           typeName="Process",
                           qualified_name=f"adb-{notebookURL[4:20]}",
                           inputs=inputs,
                           outputs=[output],
                           guid=guid.get_guid(),
                           attributes=process_attributes)

    purview_lineage = inputs + [output] + [process]
    return purview_lineage
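# A hedged usage sketch for convert_Spline_to_Purview, not part of the function above:
# it assumes the Spline agent's JSON has been written to "spline_output.json" and that
# the usual PurviewClient environment variables are set. The file name and client
# setup are illustrative assumptions.
if __name__ == "__main__":
    import os
    from pyapacheatlas.auth import ServicePrincipalAuthentication
    from pyapacheatlas.core import PurviewClient

    oauth = ServicePrincipalAuthentication(
        tenant_id=os.environ.get("TENANT_ID", ""),
        client_id=os.environ.get("CLIENT_ID", ""),
        client_secret=os.environ.get("CLIENT_SECRET", ""))
    client = PurviewClient(account_name=os.environ.get("PURVIEW_NAME", ""),
                           authentication=oauth)

    with open("spline_output.json", "r") as f:
        spline_output = f.read()

    # Convert the Spline lineage into Atlas entities and upload them in one batch.
    lineage_entities = convert_Spline_to_Purview(spline_output)
    print(client.upload_entities(batch=lineage_entities))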
def test_batches_entities_simple():
    entities = [
        AtlasEntity(str(i), "DataSet", str(i), guid=i).to_json()
        for i in range(0, 10)
    ]

    results = batch_dependent_entities(entities, batch_size=2)
    assert(len(results) == 5)
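# A hedged sketch of how the batches produced by batch_dependent_entities might be
# uploaded, assuming `client` is an authenticated AtlasClient or PurviewClient; the
# helper name and default batch size are illustrative, not part of the tests above.
def upload_in_dependent_batches(client, entities, batch_size=100):
    responses = []
    # Each element of the result is an independent list of entity dicts that can be
    # uploaded on its own, because related entities are kept in the same batch.
    for batch in batch_dependent_entities(entities, batch_size=batch_size):
        responses.append(client.upload_entities(batch=batch))
    return responses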
def add_atlas_db(self, database, database_fqn):
    self.guid_count -= 1
    db = AtlasEntity(name=database[0],
                     typeName='pyapacheatlas_mysql_db',
                     qualified_name=database_fqn,
                     guid=self.guid_count)
    self.dbs.append(db)
    return db
def add_atlas_table(self, table, table_fqn):
    self.guid_count -= 1
    db_table = AtlasEntity(name=table[0],
                           typeName='pyapacheatlas_mysql_table',
                           qualified_name=table_fqn,
                           guid=self.guid_count)
    self.db_tables.append(db_table)
    return db_table
def test_first_entity_matching_attribute():
    atlas_entities = [
        AtlasEntity(
            name="demoentity",
            typeName="demo_table",
            qualified_name="demoentity",
            guid=-1000
        ),
        AtlasEntity(
            name="demoentity2",
            typeName="demo2_table",
            qualified_name="demoentity2",
            guid=-1001
        )
    ]

    results = first_entity_matching_attribute("name", "demoentity",
                                              atlas_entities)

    assert(results.typeName == "demo_table")
def setup_batch_entities():
    atlas_entities = [
        AtlasEntity(
            name="demoentity",
            typeName="demo_table",
            qualified_name="demoentity",
            guid=-1000
        ),
        AtlasEntity(
            name="demoentity2",
            typeName="demo2_table",
            qualified_name="demoentity2",
            guid=-1001
        )
    ]
    atlas_proc = AtlasProcess(
        name="demo_process_name",
        typeName="demo_process",
        qualified_name="demo_process_qualifier",
        inputs=[atlas_entities[0].to_json(minimum=True)],
        outputs=[atlas_entities[1].to_json(minimum=True)],
        guid=-1002
    )
    atlas_proc_no_in = AtlasProcess(
        name="demo_process_qualifier_no_in",
        typeName="demo_process1",
        qualified_name="demo_process_qualifier_no_in",
        inputs=[],
        outputs=[atlas_entities[1].to_json(minimum=True)],
        guid=-1003
    )
    atlas_proc_no_out = AtlasProcess(
        name="demo_process_qualifier_no_out",
        typeName="demo_process2",
        qualified_name="demo_process_qualifier_no_out",
        inputs=[atlas_entities[0].to_json(minimum=True)],
        outputs=[],
        guid=-1004
    )
    atlas_entities.extend([atlas_proc, atlas_proc_no_in, atlas_proc_no_out])
    return atlas_entities
def test_min_entity_json_no_guid_usage():
    ae = AtlasEntity("BeforeModi", "DataSet",
                     "tests://EntityRESTBeforeModification", guid=-1)
    assignments = client.upload_entities([ae])["guidAssignments"]
    assign_with_no_guid = {}

    try:
        # live_table = client.get_entity(guid=assignments["-1"])["entities"][0]
        ae_no_guid = AtlasEntity("BeforeModi", "DataSet",
                                 "tests://EntityRESTBeforeModification",
                                 guid=None)
        proc1 = AtlasProcess("WillBeUpdatedWithNoGuidEntity", "Process",
                             "tests://EntityRESTBeforeModificationProc",
                             inputs=[ae_no_guid], outputs=[], guid=-2)

        assign_with_no_guid = client.upload_entities(
            [proc1])["guidAssignments"]
        live_proc = client.get_entity(
            guid=assign_with_no_guid["-2"])["entities"][0]

        # Should have one input that matches the guid assignment
        assert(len(live_proc["attributes"]["inputs"]) == 1)
        assert(
            live_proc["attributes"]["inputs"][0]["guid"] == assignments["-1"])

    finally:
        # Delete the entities now that the test is complete
        _ = client.delete_entity(assignments["-1"])
        if "-2" in assign_with_no_guid:
            _ = client.delete_entity(assign_with_no_guid.get("-2"))
def test_classify_entities():
    # Create an entity
    # You must provide a name, typeName, qualified_name, and guid
    # the guid must be a negative number and unique in your batch
    # being uploaded.
    input01 = AtlasEntity(name="input01", typeName="DataSet",
                          qualified_name="tests://classify_01", guid=-100)
    input02 = AtlasEntity(name="input02", typeName="DataSet",
                          qualified_name="tests://classify_02", guid=-101)
    results = client.upload_entities(batch=[input01, input02])

    # Get the Guids for us to work with
    guids = [v for v in results["guidAssignments"].values()]

    try:
        one_entity_multi_class = client.classify_entity(
            guid=guids[0],
            classifications=[
                AtlasClassification("MICROSOFT.PERSONAL.DATE_OF_BIRTH"),
                AtlasClassification("MICROSOFT.PERSONAL.NAME")
            ],
            force_update=True)
        assert(one_entity_multi_class)

        multi_entity_single_class = client.classify_bulk_entities(
            entityGuids=guids,
            classification=AtlasClassification("MICROSOFT.PERSONAL.IPADDRESS"))
        assert(multi_entity_single_class)

    finally:
        for guid in guids:
            _ = client.delete_entity(guid)
def test_prepare_bulk_entity_from_atlas_entity():
    class_entity = AtlasEntity(
        name=sample_entity["attributes"]["name"],
        typeName=sample_entity["typeName"],
        qualified_name=sample_entity["attributes"]["qualifiedName"],
        attributes=sample_entity["attributes"],
        guid=sample_entity["guid"],
        relationshipAttributes=sample_entity["relationshipAttributes"])

    results = AtlasClient._prepare_entity_upload(class_entity)

    expected = {"entities": [sample_entity]}

    assert(results == expected)
def add_atlas_instance(self, version, instance_fqn):
    self.guid_count -= 1
    self.instance = AtlasEntity(
        name=f'MySQL v.{version}',
        typeName='pyapacheatlas_mysql_instance',
        qualified_name=instance_fqn,
        guid=self.guid_count,
        attributes={
            'hostname': MYSQL_SERVER_HOSTNAME,
            'port': MYSQL_INSTANCE_PORT,
            'cloudOrOnPrem': MYSQL_SERVERINSTANCE_CLOUDORONPREM,
            'contact_info': MYSQL_SERVERINSTANCE_CONTACTINFO,
            'comment': MYSQL_SERVERINSTANCE_COMMENT
        })
def test_purview_search_iterates_safely():
    ae = AtlasEntity(name="there_can_be_only_one",
                     qualified_name="pyapacheatlas://there_can_be_only_one",
                     guid=-100,
                     typeName="hive_table")
    upload_success = client.upload_entities(ae)

    # Search for the entity we just uploaded; search_entities yields one
    # search hit at a time and handles pagination internally.
    search_results = client.search_entities(r"there_can_be_only_one")
    counter = 0
    for entity in search_results:
        # Confirm each search hit exposes an id before counting it.
        len(entity["id"])
        counter = counter + 1
    assert(counter == 1)
def add_atlas_column(self, column, table_fqn):
    self.guid_count -= 1
    column_fqn = f'{table_fqn}#{column.Field}'
    table_column = AtlasEntity(
        name=column.Field,
        typeName='pyapacheatlas_mysql_column',
        qualified_name=column_fqn,
        guid=self.guid_count,
        attributes={
            'comment': self._decode_field_if_bytes(column.Comment),
            'data_type': self._decode_field_if_bytes(column.Type),
            'default_value': self._decode_field_if_bytes(column.Default),
            'isNullable': self._decode_field_if_bytes(column.Null),
            'isPrimaryKey': self._decode_field_if_bytes(column.Key) is not None,
            'collation': self._decode_field_if_bytes(column.Collation)
        })
    self.table_columns.append(table_column)
    return table_column
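# A hedged sketch of how the three helpers above might be chained for one table.
# The `columns` argument is assumed to be an iterable of records exposing .Field,
# .Type, .Comment, and so on (however the surrounding scanner reads them from MySQL);
# the method name and fully qualified name layout are illustrative assumptions.
def add_atlas_table_with_columns(self, database, database_fqn, table, columns):
    self.add_atlas_db(database, database_fqn)
    table_fqn = f'{database_fqn}/{table[0]}'
    db_table = self.add_atlas_table(table, table_fqn)
    for column in columns:
        self.add_atlas_column(column, table_fqn)
    return db_table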
def test_set_relationship_different_ways():
    ae = AtlasEntity("rel01", "hive_table", "tests://rel01", guid=-1)
    c1 = AtlasEntity("rel01#01", "hive_column", "tests://rel01#c", guid=-2,
                     attributes={"type": "str"})
    c2 = AtlasEntity("rel01#02", "hive_column", "tests://rel02#c", guid=-3,
                     attributes={"type": "str"})
    c3 = AtlasEntity("rel01#03", "hive_column", "tests://rel03#c", guid=-4,
                     attributes={"type": "str"})
    c4 = AtlasEntity("rel01#04", "hive_column", "tests://rel04#c", guid=-5,
                     attributes={"type": "str"})

    # Add c1 as the only relationship on the table
    ae.addRelationship(columns=[c1.to_json(minimum=True)])

    c2.relationshipAttributes.update({"table": ae.to_json(minimum=True)})
    c3.addRelationship(table=ae)

    assignments = client.upload_entities(
        [ae, c1, c2, c3, c4])["guidAssignments"]

    try:
        live_table = client.get_entity(guid=assignments["-1"])["entities"][0]

        # Should have three columns: one from the table defining the
        # relationship as an array of columns and two from the columns
        # defining the table relationshipAttribute on themselves.
        assert(len(live_table["relationshipAttributes"]["columns"]) == 3)

        relationship = {
            "typeName": "hive_table_columns",
            "attributes": {},
            "guid": -100,
            # Ends are either guid or guid + typeName
            # (in case there are ambiguities?)
            "end1": {"guid": assignments["-1"]},
            "end2": {"guid": assignments["-5"]}
        }

        relation_upload = client.upload_relationship(relationship)

        # Check that we have one more relationship.
        # There are caching issues here :-( so re-fetch the table.
        live_table_post_relationship = client.get_entity(
            guid=assignments["-1"])["entities"][0]
        assert(len(
            live_table_post_relationship["relationshipAttributes"]["columns"]) == 4)

    finally:
        # Need to delete all columns BEFORE you delete the table
        for local_id in [str(s) for s in range(-5, 0)]:
            guid = assignments[local_id]
            _ = client.delete_entity(guid)
# Create a relationship between the process and steps
relationship = RelationshipTypeDef(
    name="process_with_steps_steps",
    relationshipCategory="COMPOSITION",
    # Use the Parent/Child standard end definitions.
    # "steps" will be an attribute on the process_with_steps entities;
    # it will contain a list of step_in_process and display on the schema.
    # "parent_process" will be an attribute on the step_in_process entity.
    endDef1=ParentEndDef(name="steps", typeName="process_with_steps"),
    endDef2=ChildEndDef(name="parent_process", typeName="step_in_process"))

# Create the process, steps in the process, and dummy inputs and outputs
# for the lineage visualization
step01 = AtlasEntity(
    name="Step01: Do something",
    qualified_name="process_xyz#step01",
    typeName="step_in_process",
    guid=-1000,
    description="This is the first step in which we do something.")
step02 = AtlasEntity(
    name="Step02: Modify the data",
    qualified_name="process_xyz#step02",
    typeName="step_in_process",
    guid=-1001,
    description="This is the second step in which we modify the data.")
step03 = AtlasEntity(
    name="Step03: Finalize the data",
    qualified_name="process_xyz#step03",
    typeName="step_in_process",
    guid=-1002,
    description="This is the third step in which we finalize things.")
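# A hedged continuation sketch: once a process_with_steps entity exists, each step
# could point back to it through the "parent_process" end defined above. The
# `parent_process` entity, its qualified name, and its guid are illustrative
# assumptions rather than part of the original sample.
parent_process = AtlasEntity(
    name="process_xyz",
    qualified_name="pyapacheatlas://process_xyz",
    typeName="process_with_steps",
    guid=-1003)
for step in [step01, step02, step03]:
    # addRelationship uses the relationship attribute name as the keyword.
    step.addRelationship(parent_process=parent_process)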
and hive_columns defined on each end. However, this is the slowest path as it
can only take one upload at a time, whereas entity uploads can carry many
entities at a time.
"""
# Authenticate against your Atlas server
oauth = ServicePrincipalAuthentication(
    tenant_id=os.environ.get("TENANT_ID", ""),
    client_id=os.environ.get("CLIENT_ID", ""),
    client_secret=os.environ.get("CLIENT_SECRET", ""))
client = PurviewClient(account_name=os.environ.get("PURVIEW_NAME", ""),
                       authentication=oauth)

# Creating the entities that will be used in uploads.
# One table will be added
table = AtlasEntity("rel10", "hive_table", "tests://rel10", guid=-1)
# Four columns will be added
c1 = AtlasEntity("rel10#01", "hive_column", "tests://rel10#c", guid=-2,
                 attributes={"type": "str"})
c2 = AtlasEntity("rel10#02", "hive_column", "tests://rel02#c", guid=-3,
                 attributes={"type": "str"})
c3 = AtlasEntity("rel10#03", "hive_column", "tests://rel03#c", guid=-4,
Lastly, you can always upload an individual relationship with hive_table
and hive_columns defined on each end. However, this is the slowest path as
it can only take one upload at a time, whereas entity uploads can carry
many entities at a time.
"""
# Authenticate against your Atlas server
oauth = ServicePrincipalAuthentication(
    tenant_id=os.environ.get("TENANT_ID", ""),
    client_id=os.environ.get("CLIENT_ID", ""),
    client_secret=os.environ.get("CLIENT_SECRET", ""))
client = PurviewClient(account_name=os.environ.get("PURVIEW_NAME", ""),
                       authentication=oauth)

# Creating the entities that will be used in uploads.
table = AtlasEntity("rel01", "hive_table", "tests://rel01", guid=-1)
c1 = AtlasEntity("rel01#01", "hive_column", "tests://rel01#c", guid=-2,
                 attributes={"type": "str"})
c2 = AtlasEntity("rel01#02", "hive_column", "tests://rel02#c", guid=-3,
                 attributes={"type": "str"})
c3 = AtlasEntity("rel01#03", "hive_column", "tests://rel03#c", guid=-4,
                 attributes={"type": "str"})
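# A hedged sketch of the "individual relationship" path described in the docstring
# above: one hive_table_columns relationship is uploaded per column after the
# entities already exist. Referencing the ends by typeName plus uniqueAttributes is
# standard Atlas behavior; the helper name and qualified-name arguments are
# illustrative assumptions.
def upload_single_column_relationship(client, table_qualified_name,
                                      column_qualified_name):
    relationship = {
        "typeName": "hive_table_columns",
        "attributes": {},
        "end1": {
            "typeName": "hive_table",
            "uniqueAttributes": {"qualifiedName": table_qualified_name}
        },
        "end2": {
            "typeName": "hive_column",
            "uniqueAttributes": {"qualifiedName": column_qualified_name}
        }
    }
    return client.upload_relationship(relationship)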
display(df)

# COMMAND ----------

# Now we begin to do some Atlas uploads using the types created above.
# Get the notebook path as it will be part of our process' name.
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook(
).getContext().notebookPath().get()

# COMMAND ----------

# Create an asset for the input data frame.
atlas_input_df = AtlasEntity(
    name="demo_dbfs_delays_data",
    qualified_name="pyapacheatlas://demo_dbfs_delays_data",
    typeName="custom_spark_dataframe",
    guid=guid.get_guid(),
)

# Create a process that represents our notebook and has our input
# dataframe as one of the inputs.
process = AtlasProcess(
    name="demo_cluster" + notebook_path,
    qualified_name="pyapacheatlas://demo_cluster" + notebook_path,
    typeName="custom_spark_job_process",
    guid=guid.get_guid(),
    attributes={"job_type": "notebook"},
    inputs=[atlas_input_df],
    # No outputs for this demo; otherwise, repeat what you did for the
    # input dataframe.
    outputs=[]
)
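# A hedged continuation sketch: the input dataframe entity and the process would be
# uploaded together in one batch. The `client` here is assumed to be an authenticated
# PurviewClient created earlier in the notebook (not shown in this excerpt).
upload_results = client.upload_entities(batch=[atlas_input_df, process])
print(upload_results)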
import json
import os

from pyapacheatlas.auth import BasicAuthentication, ServicePrincipalAuthentication
from pyapacheatlas.core import AtlasClient, AtlasEntity, AtlasProcess, PurviewClient
from pyapacheatlas.readers import ExcelConfiguration, ExcelReader

ae_in = AtlasEntity("test_in", "hive_table", "test://lineage_hive_in", -101)
ae_out = AtlasEntity("test_out", "hive_table", "test://lineage_hive_out", -102)
proc = AtlasProcess("test_proc", "Process", "test://lineage_hive_out",
                    guid=-103,
                    inputs=[ae_in],
                    outputs=[ae_out]
                    )
LINEAGE_BATCH = [ae_in, ae_out, proc]

auth = BasicAuthentication(username="******", password="******")
client = AtlasClient(endpoint_url="http://localhost:21000/api/atlas/v2",
                     authentication=auth)

oauth = ServicePrincipalAuthentication(
    tenant_id=os.environ.get("TENANT_ID", ""),
    client_id=os.environ.get("CLIENT_ID", ""),
    client_secret=os.environ.get("CLIENT_SECRET", "")
)
purview_client = PurviewClient(
    account_name=os.environ.get("PURVIEW_NAME", ""),
    authentication=oauth
)


def test_lineage_atlas():
    tenant_id=os.environ.get("TENANT_ID", ""),
    client_id=os.environ.get("CLIENT_ID", ""),
    client_secret=os.environ.get("CLIENT_SECRET", "")
)

client = PurviewClient(
    account_name=os.environ.get("PURVIEW_NAME", ""),
    authentication=oauth
)

# Create an entity
# You must provide a name, typeName, qualified_name, and guid
# the guid must be a negative number and unique in your batch
# being uploaded.
input01 = AtlasEntity(
    name="input01",
    typeName="DataSet",
    qualified_name="pyapacheatlas://demoinput01",
    guid=-100
)
input02 = AtlasEntity(
    name="input02",
    typeName="DataSet",
    qualified_name="pyapacheatlas://demoinput02",
    guid=-101
)

results = client.upload_entities(
    batch=[input01.to_json(), input02.to_json()]
)

# Get the Guids for us to work with
guids = [v for v in results["guidAssignments"].values()]
""" # Authenticate against your Atlas server oauth = ServicePrincipalAuthentication( tenant_id=os.environ.get("TENANT_ID", ""), client_id=os.environ.get("CLIENT_ID", ""), client_secret=os.environ.get("CLIENT_SECRET", "")) client = PurviewClient(account_name=os.environ.get("PURVIEW_NAME", ""), authentication=oauth) # Create two entities with AtlasEntity # You must provide a name, typeName, qualified_name, and guid # the guid must be a negative number and unique in your batch # being uploaded. input01 = AtlasEntity(name="input01", typeName="DataSet", qualified_name="pyapacheatlas://demoinput01", guid=-100) output01 = AtlasEntity(name="output01", typeName="DataSet", qualified_name="pyapacheatlas://demooutput01", guid=-101) # The Atlas Process is the lineage component that links the two # entities together. The inputs and outputs need to be the "header" # version of the atlas entities, so specify minimum = True to # return just guid, qualifiedName, and typeName. process = AtlasProcess(name="sample process", typeName="Process", qualified_name="pyapacheatlas://democustomprocess", inputs=[input01], outputs=[output01],
"isLegacyAttribute": False }) # Upload the results upload_results = client.upload_typedefs( entityDefs=[column_entity_def, table_entity_def], relationshipDefs=[table_column_relationship], force_update=True) # With all the types and relationships defined, we can create entities. # We can use a GuidTracker to always get a unique negative number gt = GuidTracker() table_entity = AtlasEntity( name="sample_table", qualified_name="pyapacheatlas://sample_tablepyapacheatlas_custom_type", typeName="pyapacheatlas_demo_table", guid=gt.get_guid()) # Add two columns. They must include the "relationshipAttribute" attribute. column01 = AtlasEntity( name="column01", typeName="pyapacheatlas_demo_column", qualified_name= "pyapacheatlas://sample_tablepyapacheatlas_custom_type@column01", attributes={ "data_type": "string", "description": "This is the first column." }, guid=gt.get_guid()) column02 = AtlasEntity(
def test_batches_entities_dependent():
    gt = GuidTracker()
    a = AtlasEntity("A", "DataSet", "A", guid=gt.get_guid())
    b = AtlasEntity("B", "DataSet", "B", guid=gt.get_guid())
    b.addRelationship(table=a)
    c = AtlasEntity("C", "DataSet", "C", guid=gt.get_guid())
    d = AtlasEntity("D", "DataSet", "D", guid=gt.get_guid())
    c.addRelationship(parent=b)
    d.addRelationship(parent=b)
    e = AtlasEntity("E", "DataSet", "E", guid=gt.get_guid())
    e.addRelationship(table=a)

    f = AtlasEntity("F", "DataSet", "F", guid=gt.get_guid())
    g = AtlasEntity("G", "DataSet", "G", guid=gt.get_guid())
    g.addRelationship(table=f)
    h = AtlasEntity("H", "DataSet", "H", guid=gt.get_guid())
    h.addRelationship(parent=g)

    # Intentionally out of order
    j = AtlasEntity("J", "DataSet", "J", guid=gt.get_guid())
    k = AtlasEntity("K", "DataSet", "K", guid=gt.get_guid())
    i = AtlasEntity("I", "DataSet", "I", guid=gt.get_guid())
    i.addRelationship(colA=j)
    i.addRelationship(colB=k)

    l = AtlasEntity("L", "DataSet", "L", guid=gt.get_guid())
    m = AtlasEntity("M", "DataSet", "M", guid=gt.get_guid())
    n = AtlasEntity("N", "DataSet", "N", guid=gt.get_guid())
    o = AtlasEntity("O", "DataSet", "O", guid=gt.get_guid())
    p = AtlasEntity("P", "DataSet", "P", guid=gt.get_guid())

    entities = [
        x.to_json()
        for x in [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p]
    ]

    results = batch_dependent_entities(entities, batch_size=7)
    # There are sixteen entities and a batch size of 7, so there should be
    # at least three groups:
    # one group has seven connected entities,
    # one group should have only three,
    # and all others are independent.
    assert(len(results) == 3)
    tenant_id=os.environ.get("TENANT_ID", ""),
    client_id=os.environ.get("CLIENT_ID", ""),
    client_secret=os.environ.get("CLIENT_SECRET", "")
)

client = PurviewClient(
    account_name=os.environ.get("PURVIEW_NAME", ""),
    authentication=oauth
)

# Create two entities with AtlasEntity
# You must provide a name, typeName, qualified_name, and guid
# the guid must be a negative number and unique in your batch
# being uploaded.
input01 = AtlasEntity(
    name="input01",
    typeName="DataSet",
    qualified_name="pyapacheatlas://demoinput01",
    guid=-100
)
output01 = AtlasEntity(
    name="output01",
    typeName="DataSet",
    qualified_name="pyapacheatlas://demooutput01",
    guid=-101
)

# The Atlas Process is the lineage component that links the two
# entities together. The inputs and outputs need to be the "header"
# version of the atlas entities, so specify minimum = True to
# return just guid, qualifiedName, and typeName.
process = AtlasProcess(
    name="sample process",
def test_whatif_validation():
    expected = {
        "counts": {
            "TypeDoesNotExist": 1,
            "UsingInvalidAttributes": 1,
            "MissingRequiredAttributes": 1
        },
        "total": 3,
        "values": {
            "TypeDoesNotExist": [-101],
            "UsingInvalidAttributes": {-100: {"foo"}},
            "MissingRequiredAttributes": {-98: {"req_attrib"}}
        }
    }
    entities = [
        # Valid attribute
        AtlasEntity("dummy1", "demo_table", "dummy1", -99,
                    attributes={"req_attrib": "1"}).to_json(),
        # Missing attribute
        AtlasEntity("dummy10", "demo_table", "dummy10", -98,
                    attributes={}).to_json(),
        # Non-Required attribute
        AtlasEntity("dummy20", "demo_table", "dummy20", -100,
                    attributes={"foo": "bar", "req_attrib": "abc"}).to_json(),
        # Bad Type
        AtlasEntity("dummy30", "bad_table", "dummy30", -101,
                    attributes={"foo": "bar"}).to_json()
    ]
    demo_table_type = {
        "entityDefs": [{
            'category': 'ENTITY',
            'name': 'demo_table',
            'attributeDefs': [
                {"name": "req_attrib", "isOptional": False},
                {"name": "name", "isOptional": False},
                {"name": "qualifiedName", "isOptional": False},
            ],
            'relationshipAttributeDefs': [],
            'superTypes': ['DataSet']
        }]
    }

    local_what_if = WhatIfValidator(demo_table_type)

    results = local_what_if.validate_entities(entities)

    assert(set(local_what_if.entity_required_fields["demo_table"]) ==
           set(["req_attrib", "name", "qualifiedName"]))
    assert(results == expected)
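# A hedged sketch of how the validator might gate a real upload: build the report
# first and only upload when it comes back clean. The `client` is assumed to be an
# authenticated AtlasClient or PurviewClient, and the helper name is illustrative.
def upload_if_valid(client, validator, entities):
    report = validator.validate_entities(entities)
    if report["total"] > 0:
        # Surface the per-guid issues so the caller can fix the batch before retrying.
        raise ValueError(f"What-if validation found issues: {report['values']}")
    return client.upload_entities(batch=entities)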