def test_set_relationship_different_ways():
    ae = AtlasEntity("rel01", "hive_table", "tests://rel01", guid=-1)
    c1 = AtlasEntity("rel01#01", "hive_column", "tests://rel01#c", guid=-2,
                     attributes={"type": "str"})
    c2 = AtlasEntity("rel01#02", "hive_column", "tests://rel02#c", guid=-3,
                     attributes={"type": "str"})
    c3 = AtlasEntity("rel01#03", "hive_column", "tests://rel03#c", guid=-4,
                     attributes={"type": "str"})
    c4 = AtlasEntity("rel01#04", "hive_column", "tests://rel04#c", guid=-5,
                     attributes={"type": "str"})

    # Add c1 as the only relationship defined on the table itself
    ae.addRelationship(columns=[c1.to_json(minimum=True)])
    # Define the reverse side of the relationship on the columns instead
    c2.relationshipAttributes.update({"table": ae.to_json(minimum=True)})
    c3.addRelationship(table=ae)

    assignments = client.upload_entities([ae, c1, c2, c3, c4])["guidAssignments"]

    try:
        live_table = client.get_entity(guid=assignments["-1"])["entities"][0]

        # Should have three columns: one from the table defining the
        # relationship as an array of columns and two from the columns
        # defining the table relationshipAttribute on themselves.
        assert len(live_table["relationshipAttributes"]["columns"]) == 3

        relationship = {
            "typeName": "hive_table_columns",
            "attributes": {},
            "guid": -100,
            # Ends are either guid or guid + typeName
            # (in case there are ambiguities?)
            "end1": {"guid": assignments["-1"]},
            "end2": {"guid": assignments["-5"]}
        }

        relation_upload = client.upload_relationship(relationship)

        # Check that we have one more relationship
        # There are caching issues here :-(
        live_table_post_relationship = client.get_entity(
            guid=assignments["-1"])["entities"][0]
        assert len(live_table_post_relationship["relationshipAttributes"]["columns"]) == 4

    finally:
        # Need to delete all columns BEFORE you delete the table
        for local_id in [str(s) for s in range(-5, 0)]:
            guid = assignments[local_id]
            _ = client.delete_entity(guid)
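# A minimal sketch, not part of the test above: the Atlas relationship API also
# accepts ends addressed by typeName + uniqueAttributes instead of guid, which
# avoids resolving guids first. The qualified names below reuse the test
# entities; treat the exact payload shape as an assumption to verify against
# your Atlas/Purview version.
relationship_by_unique_attributes = {
    "typeName": "hive_table_columns",
    "attributes": {},
    "end1": {
        "typeName": "hive_table",
        "uniqueAttributes": {"qualifiedName": "tests://rel01"}
    },
    "end2": {
        "typeName": "hive_column",
        "uniqueAttributes": {"qualifiedName": "tests://rel04#c"}
    }
}
# _ = client.upload_relationship(relationship_by_unique_attributes)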
def test_prepare_bulk_entity_from_mixed_atlas_entity_dict():
    class_entity = AtlasEntity(
        name=sample_entity["attributes"]["name"],
        typeName=sample_entity["typeName"],
        qualified_name=sample_entity["attributes"]["qualifiedName"],
        attributes=sample_entity["attributes"],
        guid=sample_entity["guid"],
        relationshipAttributes=sample_entity["relationshipAttributes"])
    class_entity2 = AtlasEntity(
        name=sample_entity["attributes"]["name"] + "abc",
        typeName=sample_entity["typeName"],
        qualified_name=sample_entity["attributes"]["qualifiedName"] + "abc",
        attributes=sample_entity["attributes"],
        guid=sample_entity["guid"],
        relationshipAttributes=sample_entity["relationshipAttributes"])

    results = AtlasClient._prepare_entity_upload(
        [class_entity, class_entity2.to_json()])

    sample2 = sample_entity.copy()
    sample2["attributes"]["name"] = sample2["attributes"]["name"] + "abc"
    sample2["attributes"]["qualifiedName"] = \
        sample2["attributes"]["qualifiedName"] + "abc"

    expected = {"entities": [sample_entity, sample2]}

    assert results == expected
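# A small usage sketch implied by the test above (an assumption, not part of
# the original tests): because _prepare_entity_upload normalizes both
# AtlasEntity objects and plain dicts into the same {"entities": [...]}
# payload, a mixed batch can be passed straight to upload_entities on a live
# client.
def upload_mixed_batch_example(client, class_entity, class_entity2):
    # class_entity is an AtlasEntity object, class_entity2.to_json() is a dict
    return client.upload_entities(batch=[class_entity, class_entity2.to_json()])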
input01 = AtlasEntity(name="demoinput01", qualified_name="demoinput01", guid=-5000, typeName="DataSet") output01 = AtlasEntity(name="demooutput01", qualified_name="demooutput01", guid=-5001, typeName="DataSet") parent = AtlasEntity(name="my_complex_workflow", qualified_name="process_xyz", typeName="process_with_steps", guid=-1003, relationshipAttributes={ "steps": [ step01.to_json(minimum=True), step02.to_json(minimum=True), step03.to_json(minimum=True), ] }, attributes={ "inputs": [input01.to_json(minimum=True)], "outputs": [output01.to_json(minimum=True)] }) # Create a batch of entities to be uploaded to Purview batch = [step01, step02, step03, parent, input01, output01] # Upload the types typeResults = client.upload_typedefs( entityDefs=[processWithSteps, processSteps],
    outputs=[]  # No outputs for this demo; otherwise, repeat what you did for the input dataframe.
)

# Iterate over the input data frame's columns and create them.
# Note: This is an add, not a delete. If the dataframe already exists in
# Atlas/Data Catalog, this sample is not smart enough to prune any 'orphan'
# columns. They will continue to exist and point to the dataframe.
atlas_input_df_columns = []
for column in df.schema:
    temp_column = AtlasEntity(
        name=column.name,
        typeName="custom_spark_dataframe_column",
        qualified_name="pyapacheatlas://demo_dbfs_delays_data#" + column.name,
        guid=guid.get_guid(),
        attributes={"data_type": str(column.dataType)},
        relationshipAttributes={
            "dataframe": atlas_input_df.to_json(minimum=True)
        }
    )
    atlas_input_df_columns.append(temp_column)

# COMMAND ----------

# Prepare all the entities as a batch to be uploaded.
batch = [process, atlas_input_df] + atlas_input_df_columns

# COMMAND ----------

# Upload all entities!
client.upload_entities(batch=batch)
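# A minimal pruning sketch (an assumption, not part of the original sample): if
# the dataframe was uploaded before with columns that no longer exist in
# df.schema, you could look up the live entity and delete the leftovers.
# Assumes get_entity supports qualifiedName/typeName lookup and that the
# displayText on each column relationship header carries the column name.
live_df = client.get_entity(
    qualifiedName="pyapacheatlas://demo_dbfs_delays_data",
    typeName="custom_spark_dataframe"
)["entities"][0]

current_column_names = {c.name for c in df.schema}
for live_column in live_df["relationshipAttributes"].get("columns", []):
    if live_column.get("displayText") not in current_column_names:
        client.delete_entity(guid=live_column["guid"])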
# being uploaded.
input01 = AtlasEntity(
    name="input01",
    typeName="DataSet",
    qualified_name="pyapacheatlas://demoinput01",
    guid=-100
)
input02 = AtlasEntity(
    name="input02",
    typeName="DataSet",
    qualified_name="pyapacheatlas://demoinput02",
    guid=-101
)
results = client.upload_entities(
    batch=[input01.to_json(), input02.to_json()]
)

# Get the Guids for us to work with
guids = [v for v in results["guidAssignments"].values()]

# Classify one entity with multiple classifications
print(f"Adding multiple classifications to guid: {guids[0]}")
one_entity_multi_class = client.classify_entity(
    guid=guids[0],
    classifications=[
        AtlasClassification("MICROSOFT.PERSONAL.DATE_OF_BIRTH").to_json(),
        AtlasClassification("MICROSOFT.PERSONAL.NAME").to_json()
    ],
    force_update=True
)
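# A verification sketch (an assumption, not part of the original sample): the
# classifications applied above should show up on the entity itself, in the
# "classifications" list returned by get_entity.
classified = client.get_entity(guid=guids[0])["entities"][0]
applied_types = [c["typeName"] for c in classified.get("classifications", [])]
print(f"Classifications now on {guids[0]}: {applied_types}")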
typeName="DataSet", qualified_name="pyapacheatlas://demoinput01", guid=-100 ) output01 = AtlasEntity( name="output01", typeName="DataSet", qualified_name="pyapacheatlas://demooutput01", guid=-101 ) # The Atlas Process is the lineage component that links the two # entities together. The inputs and outputs need to be the "header" # version of the atlas entities, so specify minimum = True to # return just guid, qualifiedName, and typeName. process = AtlasProcess( name="sample process", typeName="Process", qualified_name="pyapacheatlas://democustomprocess", inputs=[input01.to_json(minimum=True)], outputs=[output01.to_json(minimum=True)], guid=-102 ) # Convert the individual entities into json before uploading. results = client.upload_entities( batch=[output01.to_json(), input01.to_json(), process.to_json()] ) print(json.dumps(results, indent=2))
atlas_input_df = AtlasEntity(
    name="demo_dbfs_delays_data",
    qualified_name="pyapacheatlas://demo_dbfs_delays_data",
    typeName="custom_spark_dataframe",
    guid=guid.get_guid(),
)

# Create a process that represents our notebook and has our input
# dataframe as one of the inputs.
process = AtlasProcess(
    name="demo_cluster" + notebook_path,
    qualified_name="pyapacheatlas://demo_cluster" + notebook_path,
    typeName="custom_spark_job_process",
    guid=guid.get_guid(),
    attributes={"job_type": "notebook"},
    inputs=[atlas_input_df.to_json(minimum=True)],
    outputs=[]  # No outputs for this demo; otherwise, repeat what you did for the input dataframe.
)

# Iterate over the input data frame's columns and create them.
# Note: This is an add, not a delete. If the dataframe already exists in
# Atlas/Data Catalog, this sample is not smart enough to prune any 'orphan'
# columns. They will continue to exist and point to the dataframe.
atlas_input_df_columns = []
for column in df.schema:
    temp_column = AtlasEntity(
        name=column.name,
        typeName="custom_spark_dataframe_column",
        qualified_name="pyapacheatlas://demo_dbfs_delays_data#" + column.name,
        guid=guid.get_guid(),
"tests://rel02#c", guid=-3, attributes={"type": "str"}) c3 = AtlasEntity("rel01#03", "hive_column", "tests://rel03#c", guid=-4, attributes={"type": "str"}) c4 = AtlasEntity("rel01#04", "hive_column", "tests://rel04#c", guid=-5, attributes={"type": "str"}) # Add c1 as the only relationship to the table table.addRelationship(columns=[c1.to_json(minimum=True)]) c2.relationshipAttributes.update({"table": table.to_json(minimum=True)}) c3.addRelationship(table=table) assignments = client.upload_entities([table, c1, c2, c3, c4])["guidAssignments"] try: live_table = client.get_entity(guid=assignments["-1"])["entities"][0] # Should have two attributes because one is from the table having the # relationship defined as an array of columns and the second two from # the column's having the table relationshipAttribute defined on them. print("Here's what the upload looks like!") print(json.dumps(live_table["relationshipAttributes"], indent=2))
# we get to parsing the spreadsheet so we have something to work with.
# This is not necessary if you are working with existing entities.
inputTable = AtlasEntity(
    name="demo_hive_source",
    typeName="hive_table",
    qualified_name="pyapacheatlas://demo_update_lineage_input",
    guid=-100
)
outputTable = AtlasEntity(
    name="demo_hive_target",
    typeName="hive_table",
    qualified_name="pyapacheatlas://demo_update_lineage_output",
    guid=-101
)

# Upload these entities so we have something to work with.
# This will throw an exception if something goes wrong; otherwise we
# discard the resulting json.
_ = client.upload_entities([inputTable.to_json(), outputTable.to_json()])

# Create an empty excel template to be populated
excel_reader.make_template(file_path)

# This is just a helper to fill in some demo data
fill_in_workbook(file_path, excel_config)

# ACTUAL WORK: This parses our excel file and creates a batch to upload
lineage_processes = excel_reader.parse_update_lineage(file_path)

# This is what is getting sent to your Atlas server
# print(json.dumps(lineage_processes, indent=2))
results = client.upload_entities(lineage_processes)

print(json.dumps(results, indent=2))
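# A cleanup sketch (an assumption, not part of the original sample): remove the
# demo objects afterwards. The processes created from the workbook are deleted
# via the guids returned by the upload; the two demo tables are resolved by
# qualified name first. Assumes AtlasClient.get_entity supports
# qualifiedName/typeName lookup.
for process_guid in results.get("guidAssignments", {}).values():
    _ = client.delete_entity(guid=process_guid)

for demo_qualified_name in [
    "pyapacheatlas://demo_update_lineage_input",
    "pyapacheatlas://demo_update_lineage_output",
]:
    demo_entity = client.get_entity(
        qualifiedName=demo_qualified_name,
        typeName="hive_table"
    )["entities"][0]
    _ = client.delete_entity(guid=demo_entity["guid"])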
input01 = AtlasEntity(name="demoinput01", qualified_name="demoinput01", guid=-5000, typeName="DataSet") output01 = AtlasEntity(name="demooutput01", qualified_name="demooutput01", guid=-5001, typeName="DataSet") parent = AtlasEntity(name="my_complex_workflow", qualified_name="process_xyz", typeName="process_with_steps", guid=-1003, relationshipAttributes={ "steps": [ step01.to_json(minimum=True), step02.to_json(minimum=True), step03.to_json(minimum=True), ] }, attributes={ "inputs": [input01.to_json(minimum=True)], "outputs": [output01.to_json(minimum=True)] }) # Create a batch of entities to be uploaded as json/dicts batch = [ step01.to_json(), step02.to_json(), step03.to_json(), parent.to_json(),