def save_entities(atlas_mysql):
    oauth = ServicePrincipalAuthentication(
        tenant_id=os.environ.get('AZURE_TENANT_ID', ''),
        client_id=os.environ.get('AZURE_CLIENT_ID', ''),
        client_secret=os.environ.get('AZURE_CLIENT_SECRET', ''))
    client = PurviewClient(
        account_name=os.environ.get('PURVIEW_CATALOG_NAME', ''),
        authentication=oauth)

    # Collect the instance, database, table, and column entities into one batch.
    entities = [atlas_mysql.instance]
    entities.extend(atlas_mysql.dbs)
    entities.extend(atlas_mysql.db_tables)
    entities.extend(atlas_mysql.table_columns)

    # Upload the batch and record the assigned guids for later reference.
    assignments = client.upload_entities(entities)['guidAssignments']
    with open(f"entities.{time.time()}.txt", "a") as f:
        for guid in assignments:
            f.write(assignments[guid] + "\n")
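# A possible companion helper (a sketch, not part of the original sample):
# read back the guid file written by save_entities and delete those entities,
# e.g. to clean up after a demo run. Passing a list of guids to delete_entity
# mirrors the commented-out cleanup calls elsewhere in these samples; the
# guid_file argument is hypothetical.
def delete_saved_entities(client, guid_file):
    with open(guid_file) as f:
        guids = [line.strip() for line in f if line.strip()]
    return client.delete_entity(guid=guids)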
# Create two entities with AtlasEntity
# You must provide a name, typeName, qualified_name, and guid
# the guid must be a negative number and unique in your batch
# being uploaded.
input01 = AtlasEntity(
    name="input01",
    typeName="DataSet",
    qualified_name="pyapacheatlas://demoinput01",
    guid=-100
)
input02 = AtlasEntity(
    name="input02",
    typeName="DataSet",
    qualified_name="pyapacheatlas://demoinput02",
    guid=-101
)

results = client.upload_entities(
    batch=[input01.to_json(), input02.to_json()]
)

# Get the Guids for us to work with
guids = [v for v in results["guidAssignments"].values()]

# Classify one entity with multiple classifications
print(f"Adding multiple classifications to guid: {guids[0]}")
one_entity_multi_class = client.classify_entity(
    guid=guids[0],
    classifications=[
        AtlasClassification("MICROSOFT.PERSONAL.DATE_OF_BIRTH").to_json(),
        AtlasClassification("MICROSOFT.PERSONAL.NAME").to_json()
    ],
    force_update=True
)
"description": "This is the first column." }, guid=gt.get_guid()) column02 = AtlasEntity( name="column02", typeName="pyapacheatlas_demo_column", qualified_name= "pyapacheatlas://sample_tablepyapacheatlas_custom_type@column02", attributes={ "data_type": "int", "description": "This is the second column." }, guid=gt.get_guid()) # Add the "table" relationship attribute to your column entities column01.addRelationship(table=table_entity) column02.addRelationship(table=table_entity) # Do the upload and view the entities in the UI upload_results = client.upload_entities( batch=[table_entity, column01, column02]) print(json.dumps(upload_results, indent=2)) # To remove, delete the entity created and then the entity type. # client.delete_entity(guid=["..."]) # delete_relationship = client.delete_type("pyapacheatlas_table_column_relationship") # delete_results = client.delete_type("pyapacheatlas_demo_table") # delete_results = client.delete_type("pyapacheatlas_demo_column") # print(json.dumps(delete_results, indent=2))
typeName="DataSet", qualified_name="pyapacheatlas://demoinput01", guid=-100 ) output01 = AtlasEntity( name="output01", typeName="DataSet", qualified_name="pyapacheatlas://demooutput01", guid=-101 ) # The Atlas Process is the lineage component that links the two # entities together. The inputs and outputs need to be the "header" # version of the atlas entities, so specify minimum = True to # return just guid, qualifiedName, and typeName. process = AtlasProcess( name="sample process", typeName="Process", qualified_name="pyapacheatlas://democustomprocess", inputs=[input01.to_json(minimum=True)], outputs=[output01.to_json(minimum=True)], guid=-102 ) # Convert the individual entities into json before uploading. results = client.upload_entities( batch=[output01.to_json(), input01.to_json(), process.to_json()] ) print(json.dumps(results, indent=2))
"tests://rel03#c", guid=-4, attributes={"type": "str"}) c4 = AtlasEntity("rel01#04", "hive_column", "tests://rel04#c", guid=-5, attributes={"type": "str"}) # Add c1 as the only relationship to the table table.addRelationship(columns=[c1.to_json(minimum=True)]) c2.relationshipAttributes.update({"table": table.to_json(minimum=True)}) c3.addRelationship(table=table) assignments = client.upload_entities([table, c1, c2, c3, c4])["guidAssignments"] try: live_table = client.get_entity(guid=assignments["-1"])["entities"][0] # Should have two attributes because one is from the table having the # relationship defined as an array of columns and the second two from # the column's having the table relationshipAttribute defined on them. print("Here's what the upload looks like!") print(json.dumps(live_table["relationshipAttributes"], indent=2)) print("Now we are creating a relationship.") relationship = { # When creating manually, you have to "know" the typeName # and the types of each end. "typeName": "hive_table_columns",
# (tail of the client constructor; the full pattern appears in the first
#  sample above)
    authentication=oauth)

# Create two entities with AtlasEntity
# You must provide a name, typeName, qualified_name, and guid
# the guid must be a negative number and unique in your batch
# being uploaded.
input01 = AtlasEntity(name="input01",
                      typeName="DataSet",
                      qualified_name="pyapacheatlas://demoinput01",
                      guid=-100)
output01 = AtlasEntity(name="output01",
                       typeName="DataSet",
                       qualified_name="pyapacheatlas://demooutput01",
                       guid=-101)

# The Atlas Process is the lineage component that links the two
# entities together. The inputs and outputs need to be the "header"
# version of the atlas entities, so specify minimum = True to
# return just guid, qualifiedName, and typeName.
process = AtlasProcess(name="sample process",
                       typeName="Process",
                       qualified_name="pyapacheatlas://democustomprocess",
                       inputs=[input01],
                       outputs=[output01],
                       guid=-102)

# Note: unlike the earlier sample, the AtlasEntity objects are passed
# directly; upload_entities accepts either the objects or their .to_json()
# output.
results = client.upload_entities(batch=[output01, input01, process])

print(json.dumps(results, indent=2))
# Create two dummy hive tables before we get to parsing the spreadsheet so
# we have something to work with.
# This is not necessary if you are working with existing entities.
inputTable = AtlasEntity(
    name="demo_hive_source",
    typeName="hive_table",
    qualified_name="pyapacheatlas://demo_update_lineage_input",
    guid=-100)
outputTable = AtlasEntity(
    name="demo_hive_target",
    typeName="hive_table",
    qualified_name="pyapacheatlas://demo_update_lineage_output",
    guid=-101)

# Upload these entities so we have something to work with.
# This will throw an exception if something goes wrong; otherwise we
# discard the resulting json.
_ = client.upload_entities([inputTable.to_json(), outputTable.to_json()])

# Create an empty excel template to be populated
excel_reader.make_template(file_path)

# This is just a helper to fill in some demo data
fill_in_workbook(file_path, excel_config)

# ACTUAL WORK: This parses our excel file and creates a batch to upload
lineage_processes = excel_reader.parse_update_lineage(file_path)

# This is what is getting sent to your Atlas server
# print(json.dumps(lineage_processes,indent=2))

results = client.upload_entities(lineage_processes)

print(json.dumps(results, indent=2))
# Create an entity
# You must provide a name, typeName, qualified_name, and guid
# the guid must be a negative number and unique in your batch
# being uploaded.
input01 = AtlasEntity(
    name="input01",
    typeName="DataSet",
    qualified_name="pyapacheatlas://demoinputclassification01",
    guid=-100)
input02 = AtlasEntity(
    name="input02",
    typeName="DataSet",
    qualified_name="pyapacheatlas://demoinputclassification02",
    guid=-101)

results = client.upload_entities(batch=[input01, input02])

# Get the Guids for us to work with
guids = [v for v in results["guidAssignments"].values()]

# Classify one entity with multiple classifications
print(f"Adding multiple classifications to guid: {guids[0]}")
one_entity_multi_class = client.classify_entity(
    guid=guids[0],
    classifications=[
        AtlasClassification("MICROSOFT.PERSONAL.DATE_OF_BIRTH"),
        AtlasClassification("MICROSOFT.PERSONAL.NAME")
    ],
    force_update=True)

print(json.dumps(one_entity_multi_class, indent=2))
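# A follow-on sketch (not part of the original excerpt): classify the second
# uploaded entity with a single classification, reusing the classify_entity
# call demonstrated above; the classification name is just an example.
one_entity_one_class = client.classify_entity(
    guid=guids[1],
    classifications=[AtlasClassification("MICROSOFT.PERSONAL.NAME")],
    force_update=True)
print(json.dumps(one_entity_one_class, indent=2))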
# MAGIC %md
# MAGIC ##### 6. Upload Notebook mapping into Purview

# COMMAND ----------

maps = spark.read.option("header", "true").csv("/mnt/datafiles/purview/notebook_mapping.csv")

for map in maps.rdd.collect():
    nbname = map.notebook.split('/')[-1]
    print("Adding :" + nbname)
    InputEntity = client.get_entity(
        qualifiedName=[map.source],
        typeName='azure_datalake_gen2_path'
    )
    OutputEntity = client.get_entity(
        qualifiedName=[map.target],
        typeName="databricks_table"
    )
    job_process = AtlasProcess(
        name=nbname,
        qualified_name="databricks://" + v_databricks_domain + "/notebooks/" + nbname,
        typeName="databricks_job",
        guid=guid.get_guid(),
        attributes={"job_type": "notebook", "notebook_path": map.notebook},
        inputs=[InputEntity.get("entities")[0]],
        outputs=[OutputEntity.get("entities")[0]]
    )
    client.upload_entities(job_process)
# SETUP: This is just setting up the excel file for you
file_path = "./demo_custom_type_and_entity_upload.xlsx"
excel_config = ExcelConfiguration()
excel_reader = ExcelReader(excel_config)

# Create an empty excel template to be populated
excel_reader.make_template(file_path)

# This is just a helper to fill in some demo data
fill_in_type_workbook(file_path, excel_config)
fill_in_entity_workbook(file_path, excel_config)

# ACTUAL WORK: This parses our excel file and creates a batch to upload
typedefs = excel_reader.parse_entity_defs(file_path)
entities = excel_reader.parse_bulk_entities(file_path)

# This is what is getting sent to your Atlas server
# print(json.dumps(typedefs,indent=2))
# print(json.dumps(entities,indent=2))

type_results = client.upload_typedefs(typedefs, force_update=True)
entity_results = client.upload_entities(entities)

print(json.dumps(type_results, indent=2))
print("\n")
print(json.dumps(entity_results, indent=2))

print(
    "Completed type and bulk upload successfully!\nSearch for exampledataset to see your results."
)
attributes={ "inputs": [input01.to_json(minimum=True)], "outputs": [output01.to_json(minimum=True)] }) # Create a batch of entities to be uploaded as json/dicts batch = [ step01.to_json(), step02.to_json(), step03.to_json(), parent.to_json(), input01.to_json(), output01.to_json() ] # Upload the types typeResults = client.upload_typedefs( { "entityDefs": [processWithSteps.to_json(), processSteps.to_json()], "relationshipDefs": [relationship.to_json()] }, force_update=True) # Upload the entities results = client.upload_entities({"entities": batch}) # Print the results of the entities upload print(json.dumps(results, indent=2)) print("Successfully created types and entities!")
{"Source": "*", "Sink": "Out01UniqueField3"}, {"Source": "*", "Sink": "Out01UniqueField4"}], "DatasetMapping": {"Source":"*","Sink": colMapOutput01.qualifiedName} }, # This is another example of the above special case for an input object {"ColumnMapping": [ {"Source": "*", "Sink": "In01UniqueField"}, {"Source": "*", "Sink": "In01UniqueField2"}], "DatasetMapping": {"Source": "*", "Sink": colMapInput01.qualifiedName} } ] # Create the process with the stringified column mapping json. process = AtlasProcess( name="test process", typeName="ProcessWithColumnMapping", qualified_name="pyapacheatlas://colMapOutputProcessDemo", inputs=[colMapInput01, colMapInput02], outputs=[colMapOutput01], guid=gt.get_guid(), attributes={"columnMapping": json.dumps(column_mapping)} ) results = client.upload_entities( [process, colMapInput01, colMapInput02, colMapOutput01] ) print(json.dumps(results, indent=2)) sink_guid = results["guidAssignments"][str(colMapOutput01.guid)] print(f"Search for \"{colMapOutput01.name}\" or use the guid {sink_guid} to look up the sink table.")
name="sample_process_xyz", typeName="Process", qualified_name="pyapacheatlas://democustomprocess", inputs=None, # Set to None so no update will occur outputs=None, # We will update this with .outputs below guid=-104) real_existing_process = client.get_entity( typeName="Process", qualifiedName="pyapacheatlas://democustomprocess")["entities"][0] print("Working with process guid: {}".format( real_existing_process["guid"])) # Get the list of existing outputs from the attributes. existing_outputs = real_existing_process["attributes"]["outputs"] # Create one more output to be added. one_more_output = AtlasEntity( name="output_added_later", typeName="DataSet", qualified_name="pyapacheatlas://demooutput04", guid=-103) # Add the existing and new output to the dummy process dummy_existing_process.outputs = existing_outputs + [one_more_output] complex_results = client.upload_entities( batch=[dummy_existing_process, one_more_output]) print(json.dumps(complex_results, indent=2))
process = AtlasProcess(
    # ... earlier arguments omitted in this excerpt ...
    outputs=[]  # No outputs for this demo, but otherwise repeat what you did for the input dataframe.
)

# Iterate over the input data frame's columns and create them.
# Note: This is an add, not a delete. If the dataframe already exists in
# Atlas/Data Catalog, this sample is not smart enough to prune any 'orphan'
# columns. They will continue to exist and point to the dataframe.
atlas_input_df_columns = []
for column in df.schema:
    temp_column = AtlasEntity(
        name=column.name,
        typeName="custom_spark_dataframe_column",
        qualified_name="pyapacheatlas://demo_dbfs_delays_data#" + column.name,
        guid=guid.get_guid(),
        attributes={"data_type": str(column.dataType)},
        relationshipAttributes={
            "dataframe": atlas_input_df.to_json(minimum=True)
        })
    atlas_input_df_columns.append(temp_column)

# COMMAND ----------

# Prepare all the entities as a batch to be uploaded.
batch = [process, atlas_input_df] + atlas_input_df_columns

# COMMAND ----------

# Upload all entities!
client.upload_entities(batch=batch)
# (the start of this statement is truncated in the excerpt; it assigns the
#  parsed workbook output to excel_results)
    atlas_type_defs, use_column_mapping=True)

print("Results from excel transformation")
print(json.dumps(excel_results, indent=2))

input(">>>>Review the above results to see what your excel file contained")

# Validate with the What-If validator.
whatif = WhatIfValidator(type_defs=atlas_type_defs)
report = whatif.validate_entities(excel_results)

if report["total"] > 0:
    print("There were errors in the provided entities")
    print(report)
    exit(1)
else:
    print("There were no errors in the excel file")

input(
    ">>>>Review the what-if validation results above and get ready to upload your entities!"
)

# Upload excel file's content to Atlas and view the guid assignments to
# confirm successful upload.
uploaded_entities = client.upload_entities(excel_results)
print(json.dumps(uploaded_entities, indent=2))

print("Completed uploads of demo!")
# Be sure to clean up the excel file stored in file_path
qualified_name="process_xyz", typeName="process_with_steps", guid=-1003, relationshipAttributes={ "steps": [ step01.to_json(minimum=True), step02.to_json(minimum=True), step03.to_json(minimum=True), ] }, attributes={ "inputs": [input01.to_json(minimum=True)], "outputs": [output01.to_json(minimum=True)] }) # Create a batch of entities to be uploaded to Purview batch = [step01, step02, step03, parent, input01, output01] # Upload the types typeResults = client.upload_typedefs( entityDefs=[processWithSteps, processSteps], relationshipDefs=[relationship], force_update=True) # Upload the entities results = client.upload_entities(batch) # Print the results of the entities upload print(json.dumps(results, indent=2)) print("Successfully created types and entities!")
results = client.upload_typedefs(entityDefs=[edef], force_update=True)

# Just for demonstration purposes, get the entity type def.
get_results = client.get_typedef(
    TypeCategory.ENTITY, name="pyapacheatlas_create_type_def_sample")
print("# Results from getting created type def:")
print(json.dumps(get_results, indent=2))

# Creating an instance of this custom type
actual_entity = AtlasEntity(
    name="instance_of_pyapacheatlas_create_type_def_sample",
    qualified_name="pyapacheatlas://instance_of_pyapacheatlas_create_type_def_sample",
    typeName="pyapacheatlas_create_type_def_sample",
    attributes={
        "column1": "abc",
        "column2": 123,
        "column3": ["a", "b"]
    },
    guid=-100)

upload_results = client.upload_entities(actual_entity)

print("# Results of entity upload:")
print(json.dumps(upload_results, indent=2))

# To remove, delete the entity created and then the entity type.
# client.delete_entity(guid=["..."])
# delete_results = client.delete_type("pyapacheatlas_create_type_def_sample")
# print(json.dumps(delete_results, indent=2))