Example #1
def save_entities(atlas_mysql):
    oauth = ServicePrincipalAuthentication(
        tenant_id=os.environ.get('AZURE_TENANT_ID', ''),
        client_id=os.environ.get('AZURE_CLIENT_ID', ''),
        client_secret=os.environ.get('AZURE_CLIENT_SECRET', ''))
    client = PurviewClient(account_name=os.environ.get('PURVIEW_CATALOG_NAME',
                                                       ''),
                           authentication=oauth)
    # Collect the instance, database, table, and column entities into one batch.
    entities = [atlas_mysql.instance]
    entities.extend(atlas_mysql.dbs)
    entities.extend(atlas_mysql.db_tables)
    entities.extend(atlas_mysql.table_columns)

    assignments = client.upload_entities(entities)['guidAssignments']
    f = open(f"entities.{time.time()}.txt", "a")
    for guid in assignments:
        f.write(assignments[guid] + "\n")
    f.close()
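
If you keep those guid files around, they double as a cleanup manifest. Below is a minimal sketch, assuming the same environment variables as save_entities and a helper name (delete_saved_entities) invented for illustration; it re-reads one of the files written above and hands the guids to client.delete_entity, the same call the commented-out cleanup snippets in the later examples use.

import os

from pyapacheatlas.auth import ServicePrincipalAuthentication
from pyapacheatlas.core import PurviewClient

def delete_saved_entities(guid_file_path):
    # Same service principal / Purview setup as save_entities above.
    oauth = ServicePrincipalAuthentication(
        tenant_id=os.environ.get('AZURE_TENANT_ID', ''),
        client_id=os.environ.get('AZURE_CLIENT_ID', ''),
        client_secret=os.environ.get('AZURE_CLIENT_SECRET', ''))
    client = PurviewClient(account_name=os.environ.get('PURVIEW_CATALOG_NAME',
                                                       ''),
                           authentication=oauth)

    # Read back the guids written by save_entities, one per line.
    with open(guid_file_path, "r") as f:
        guids = [line.strip() for line in f if line.strip()]

    # delete_entity accepts a list of guids.
    client.delete_entity(guid=guids)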
Example #2
    # Create two entities with AtlasEntity
    # You must provide a name, typeName, qualified_name, and guid
    # the guid must be a negative number and unique in your batch
    # being uploaded.
    input01 = AtlasEntity(
        name="input01",
        typeName="DataSet",
        qualified_name="pyapacheatlas://demoinput01",
        guid=-100
    )
    input02 = AtlasEntity(
        name="input02",
        typeName="DataSet",
        qualified_name="pyapacheatlas://demoinput02",
        guid=-101
    )

    results = client.upload_entities(
        batch=[input01.to_json(), input02.to_json()]
    )

    # Get the Guids for us to work with
    guids = [v for v in results["guidAssignments"].values()]

    # Classify one entity with multiple classifications
    print(f"Adding multiple classifications to guid: {guids[0]}")
    one_entity_multi_class = client.classify_entity(
        guid=guids[0], 
        classifications=[
            AtlasClassification("MICROSOFT.PERSONAL.DATE_OF_BIRTH").to_json(),
            AtlasClassification("MICROSOFT.PERSONAL.NAME").to_json()
            ],
        force_update=True
    )
            "description": "This is the first column."
        },
        guid=gt.get_guid())
    column02 = AtlasEntity(
        name="column02",
        typeName="pyapacheatlas_demo_column",
        qualified_name=
        "pyapacheatlas://sample_tablepyapacheatlas_custom_type@column02",
        attributes={
            "data_type": "int",
            "description": "This is the second column."
        },
        guid=gt.get_guid())

    # Add the "table" relationship attribute to your column entities
    column01.addRelationship(table=table_entity)
    column02.addRelationship(table=table_entity)

    # Do the upload and view the entities in the UI
    upload_results = client.upload_entities(
        batch=[table_entity, column01, column02])

    print(json.dumps(upload_results, indent=2))

    # To remove, delete the entity created and then the entity type.
    # client.delete_entity(guid=["..."])
    # delete_relationship = client.delete_type("pyapacheatlas_table_column_relationship")
    # delete_results = client.delete_type("pyapacheatlas_demo_table")
    # delete_results = client.delete_type("pyapacheatlas_demo_column")
    # print(json.dumps(delete_results, indent=2))
Example #4
        typeName="DataSet",
        qualified_name="pyapacheatlas://demoinput01",
        guid=-100
    )
    output01 = AtlasEntity(
        name="output01",
        typeName="DataSet",
        qualified_name="pyapacheatlas://demooutput01",
        guid=-101
    )

    # The Atlas Process is the lineage component that links the two
    # entities together. The inputs and outputs need to be the "header"
    # version of the atlas entities, so specify minimum = True to
    # return just guid, qualifiedName, and typeName.
    process = AtlasProcess(
        name="sample process",
        typeName="Process",
        qualified_name="pyapacheatlas://democustomprocess",
        inputs=[input01.to_json(minimum=True)],
        outputs=[output01.to_json(minimum=True)],
        guid=-102
    )

    # Convert the individual entities into json before uploading.
    results = client.upload_entities(
        batch=[output01.to_json(), input01.to_json(), process.to_json()]
    )

    print(json.dumps(results, indent=2))
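
To confirm the lineage actually landed, you can read the process back by type and qualified name, the same way Example #13 further down does. A minimal sketch, assuming the upload above succeeded and client is still in scope:

    # Fetch the process we just created and inspect its inputs and outputs.
    # get_entity returns a bulk-style response with an "entities" list.
    live_process = client.get_entity(
        typeName="Process",
        qualifiedName="pyapacheatlas://democustomprocess")["entities"][0]
    print(json.dumps(live_process["attributes"]["inputs"], indent=2))
    print(json.dumps(live_process["attributes"]["outputs"], indent=2))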
Example #5
                     "tests://rel03#c",
                     guid=-4,
                     attributes={"type": "str"})
    c4 = AtlasEntity("rel01#04",
                     "hive_column",
                     "tests://rel04#c",
                     guid=-5,
                     attributes={"type": "str"})

    # Add c1 as the only relationship to the table
    table.addRelationship(columns=[c1.to_json(minimum=True)])

    c2.relationshipAttributes.update({"table": table.to_json(minimum=True)})
    c3.addRelationship(table=table)

    assignments = client.upload_entities([table, c1, c2, c3,
                                          c4])["guidAssignments"]

    try:
        live_table = client.get_entity(guid=assignments["-1"])["entities"][0]

        # The table should have three columns: one from the table having the
        # relationship defined as an array of columns, and the other two from
        # the columns having the table relationshipAttribute defined on them.
        print("Here's what the upload looks like!")
        print(json.dumps(live_table["relationshipAttributes"], indent=2))
        print("Now we are creating a relationship.")

        relationship = {
            # When creating manually, you have to "know" the typeName
            # and the types of each end.
            "typeName": "hive_table_columns",
Example #6
                           authentication=oauth)

    # Create two entities with AtlasEntity
    # You must provide a name, typeName, qualified_name, and guid
    # the guid must be a negative number and unique in your batch
    # being uploaded.
    input01 = AtlasEntity(name="input01",
                          typeName="DataSet",
                          qualified_name="pyapacheatlas://demoinput01",
                          guid=-100)
    output01 = AtlasEntity(name="output01",
                           typeName="DataSet",
                           qualified_name="pyapacheatlas://demooutput01",
                           guid=-101)

    # The Atlas Process is the lineage component that links the two
    # entities together. The inputs and outputs can be passed either as the
    # AtlasEntity objects themselves (as done here) or as the minimal
    # "header" json produced by to_json(minimum=True), as in Example #4.
    process = AtlasProcess(name="sample process",
                           typeName="Process",
                           qualified_name="pyapacheatlas://democustomprocess",
                           inputs=[input01],
                           outputs=[output01],
                           guid=-102)

    # Upload the entities; upload_entities converts the objects to json for you.
    results = client.upload_entities(batch=[output01, input01, process])

    print(json.dumps(results, indent=2))
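
Because Atlas upserts on typeName plus qualifiedName, re-uploading a process with the same qualified_name updates the existing entity rather than creating a duplicate; Example #13 further down relies on exactly this. A minimal sketch of swapping the output, with output02 and its qualified name invented for illustration:

    # Hypothetical new output; the guid only needs to be a unique negative
    # placeholder within this batch.
    output02 = AtlasEntity(name="output02",
                           typeName="DataSet",
                           qualified_name="pyapacheatlas://demooutput02",
                           guid=-103)

    # Same typeName and qualified_name as the process above, so this updates
    # the existing process. Note that supplying a new outputs list overwrites
    # the old one; see Example #13 for how to append instead.
    updated_process = AtlasProcess(name="sample process",
                                   typeName="Process",
                                   qualified_name="pyapacheatlas://democustomprocess",
                                   inputs=[input01],
                                   outputs=[output02],
                                   guid=-104)

    print(json.dumps(
        client.upload_entities(batch=[output02, updated_process]), indent=2))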
Example #7
    # Create some demo source and target tables in Atlas before
    # we get to parsing the spreadsheet so we have something to work with.
    # This is not necessary if you are working with existing entities.
    inputTable = AtlasEntity(
        name="demo_hive_source",
        typeName="hive_table",
        qualified_name="pyapacheatlas://demo_update_lineage_input",
        guid=-100)
    outputTable = AtlasEntity(
        name="demo_hive_target",
        typeName="hive_table",
        qualified_name="pyapacheatlas://demo_update_lineage_output",
        guid=-101)
    # Upload these entities so we have something to work with
    # This will throw an exception if something goes wrong; otherwise
    # we discard the resulting json.
    _ = client.upload_entities([inputTable.to_json(), outputTable.to_json()])

    # Create an empty excel template to be populated
    excel_reader.make_template(file_path)
    # This is just a helper to fill in some demo data
    fill_in_workbook(file_path, excel_config)

    # ACTUAL WORK: This parses our excel file and creates a batch to upload
    lineage_processes = excel_reader.parse_update_lineage(file_path)

    # This is what is getting sent to your Atlas server
    # print(json.dumps(lineage_processes,indent=2))

    results = client.upload_entities(lineage_processes)

    print(json.dumps(results, indent=2))
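
When you are done experimenting, the guid assignments returned by the upload can drive a cleanup pass, mirroring the commented-out delete_entity calls in the other examples. A minimal sketch that removes only the lineage processes created from the spreadsheet:

    # Optional cleanup: guidAssignments maps the temporary negative guids
    # used in the batch to the real guids Atlas assigned.
    process_guids = list(results["guidAssignments"].values())
    client.delete_entity(guid=process_guids)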
Example #8
    # Create an entity
    # You must provide a name, typeName, qualified_name, and guid
    # the guid must be a negative number and unique in your batch
    # being uploaded.
    input01 = AtlasEntity(
        name="input01",
        typeName="DataSet",
        qualified_name="pyapacheatlas://demoinputclassification01",
        guid=-100)
    input02 = AtlasEntity(
        name="input02",
        typeName="DataSet",
        qualified_name="pyapacheatlas://demoinputclassification02",
        guid=-101)

    results = client.upload_entities(batch=[input01, input02])

    # Get the Guids for us to work with
    guids = [v for v in results["guidAssignments"].values()]

    # Classify one entity with multiple classifications
    print(f"Adding multiple classifications to guid: {guids[0]}")
    one_entity_multi_class = client.classify_entity(
        guid=guids[0],
        classifications=[
            AtlasClassification("MICROSOFT.PERSONAL.DATE_OF_BIRTH"),
            AtlasClassification("MICROSOFT.PERSONAL.NAME")
        ],
        force_update=True)
    print(json.dumps(one_entity_multi_class, indent=2))
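
To double-check that the classifications stuck, read the entity back by guid; when classifications are present they appear in the entity json under a classifications key (the exact response shape can vary slightly by Atlas/Purview version, hence the defensive .get):

    # Read the classified entity back and print whatever classifications
    # the catalog reports for it.
    classified = client.get_entity(guid=guids[0])["entities"][0]
    print(json.dumps(classified.get("classifications", []), indent=2))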
Example #9
# MAGIC %md
# MAGIC ##### 6. Upload Notebook mapping into Purview

# COMMAND ----------

maps = spark.read.option("header", "true").csv("/mnt/datafiles/purview/notebook_mapping.csv")
for row in maps.rdd.collect():
  nbname = row.notebook.split('/')[-1]
  print("Adding: " + nbname)
  InputEntity = client.get_entity(
      qualifiedName=[row.source],
      typeName='azure_datalake_gen2_path')
  OutputEntity = client.get_entity(
      qualifiedName=[row.target],
      typeName="databricks_table")
  job_process = AtlasProcess(
      name=nbname,
      qualified_name="databricks://" + v_databricks_domain + "/notebooks/" + nbname,
      typeName="databricks_job",
      guid=guid.get_guid(),
      attributes={"job_type": "notebook", "notebook_path": row.notebook},
      inputs=[InputEntity.get("entities")[0]],
      outputs=[OutputEntity.get("entities")[0]])

  client.upload_entities(job_process)

  
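Calling upload_entities once per row works, but each call is a separate round trip to Purview. Since upload_entities accepts a batch, the loop can just collect the AtlasProcess objects and upload them all at once. A minimal sketch of that variant, reusing the same client, guid tracker, and mapping dataframe as the cell above:

# COMMAND ----------

# Batch variant: build every job process first, then upload once.
job_processes = []
for row in maps.rdd.collect():
  nbname = row.notebook.split('/')[-1]
  input_entity = client.get_entity(qualifiedName=[row.source],
                                   typeName='azure_datalake_gen2_path')
  output_entity = client.get_entity(qualifiedName=[row.target],
                                    typeName="databricks_table")
  job_processes.append(AtlasProcess(
      name=nbname,
      qualified_name="databricks://" + v_databricks_domain + "/notebooks/" + nbname,
      typeName="databricks_job",
      guid=guid.get_guid(),
      attributes={"job_type": "notebook", "notebook_path": row.notebook},
      inputs=[input_entity.get("entities")[0]],
      outputs=[output_entity.get("entities")[0]]))

# One upload for the whole mapping file.
client.upload_entities(job_processes)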
Example #10
    # SETUP: This is just setting up the excel file for you
    file_path = "./demo_custom_type_and_entity_upload.xlsx"
    excel_config = ExcelConfiguration()
    excel_reader = ExcelReader(excel_config)

    # Create an empty excel template to be populated
    excel_reader.make_template(file_path)
    # This is just a helper to fill in some demo data
    fill_in_type_workbook(file_path, excel_config)
    fill_in_entity_workbook(file_path, excel_config)

    # ACTUAL WORK: This parses our excel file and creates a batch to upload
    typedefs = excel_reader.parse_entity_defs(file_path)
    entities = excel_reader.parse_bulk_entities(file_path)

    # This is what is getting sent to your Atlas server
    # print(json.dumps(typedefs,indent=2))
    # print(json.dumps(entities,indent=2))

    type_results = client.upload_typedefs(typedefs, force_update=True)
    entity_results = client.upload_entities(entities)

    print(json.dumps(type_results, indent=2))
    print("\n")
    print(json.dumps(entity_results, indent=2))

    print(
        "Completed type and bulk upload successfully!\nSearch for exampledataset to see your results."
    )
Example #11
                         attributes={
                             "inputs": [input01.to_json(minimum=True)],
                             "outputs": [output01.to_json(minimum=True)]
                         })

    # Create a batch of entities to be uploaded as json/dicts
    batch = [
        step01.to_json(),
        step02.to_json(),
        step03.to_json(),
        parent.to_json(),
        input01.to_json(),
        output01.to_json()
    ]

    # Upload the types
    typeResults = client.upload_typedefs(
        {
            "entityDefs": [processWithSteps.to_json(),
                           processSteps.to_json()],
            "relationshipDefs": [relationship.to_json()]
        },
        force_update=True)

    # Upload the entities
    results = client.upload_entities({"entities": batch})

    # Print the results of the entities upload
    print(json.dumps(results, indent=2))
    print("Successfully created types and entities!")
Example #12
            {"Source": "*", "Sink": "Out01UniqueField3"},
            {"Source": "*", "Sink": "Out01UniqueField4"}],
            "DatasetMapping": {"Source":"*","Sink": colMapOutput01.qualifiedName}
         },
         # This is another example of the above special case for an input object
         {"ColumnMapping": [
            {"Source": "*", "Sink": "In01UniqueField"},
            {"Source": "*", "Sink": "In01UniqueField2"}],
            "DatasetMapping": {"Source": "*", "Sink": colMapInput01.qualifiedName}
         }
    ]

    # Create the process with the stringified column mapping json.
    process = AtlasProcess(
        name="test process",
        typeName="ProcessWithColumnMapping",
        qualified_name="pyapacheatlas://colMapOutputProcessDemo",
        inputs=[colMapInput01, colMapInput02],
        outputs=[colMapOutput01],
        guid=gt.get_guid(),
        attributes={"columnMapping": json.dumps(column_mapping)}
    )

    results = client.upload_entities(
        [process, colMapInput01, colMapInput02, colMapOutput01]
    )

    print(json.dumps(results, indent=2))
    sink_guid = results["guidAssignments"][str(colMapOutput01.guid)]
    print(f"Search for \"{colMapOutput01.name}\" or use the guid {sink_guid} to look up the sink table.")
Example #13
        name="sample_process_xyz",
        typeName="Process",
        qualified_name="pyapacheatlas://democustomprocess",
        inputs=None,  # Set to None so no update will occur
        outputs=None,  # We will update this with .outputs below
        guid=-104)

    real_existing_process = client.get_entity(
        typeName="Process",
        qualifiedName="pyapacheatlas://democustomprocess")["entities"][0]
    print("Working with process guid: {}".format(
        real_existing_process["guid"]))

    # Get the list of existing outputs from the attributes.
    existing_outputs = real_existing_process["attributes"]["outputs"]

    # Create one more output to be added.
    one_more_output = AtlasEntity(
        name="output_added_later",
        typeName="DataSet",
        qualified_name="pyapacheatlas://demooutput04",
        guid=-103)

    # Add the existing and new output to the dummy process
    dummy_existing_process.outputs = existing_outputs + [one_more_output]

    complex_results = client.upload_entities(
        batch=[dummy_existing_process, one_more_output])

    print(json.dumps(complex_results, indent=2))
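
A quick way to confirm the partial update behaved as intended is to re-fetch the process and compare the outputs count; a minimal sketch reusing the get_entity call from above:

    # Re-fetch the process: the outputs list should have grown by one while
    # the inputs were left untouched (because inputs was set to None).
    refreshed = client.get_entity(
        typeName="Process",
        qualifiedName="pyapacheatlas://democustomprocess")["entities"][0]
    print("Outputs after update: {}".format(
        len(refreshed["attributes"]["outputs"])))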
Example #14
    outputs=[
    ]  # No outputs for this demo; otherwise, repeat what you did for the input dataframe.
)

# Iterate over the input data frame's columns and create them.
# Note: This is an add, not a delete. If the dataframe already exists in
# Atlas/Data Catalog, this sample is not smart enough to prune any 'orphan'
# columns. They will continue to exist and point to the dataframe.
atlas_input_df_columns = []
for column in df.schema:
    temp_column = AtlasEntity(
        name=column.name,
        typeName="custom_spark_dataframe_column",
        qualified_name="pyapacheatlas://demo_dbfs_delays_data#" + column.name,
        guid=guid.get_guid(),
        attributes={"data_type": str(column.dataType)},
        relationshipAttributes={
            "dataframe": atlas_input_df.to_json(minimum=True)
        })
    atlas_input_df_columns.append(temp_column)

# COMMAND ----------

# Prepare all the entities as a batch to be uploaded.
batch = [process, atlas_input_df] + atlas_input_df_columns

# COMMAND ----------

# Upload all entities!
client.upload_entities(batch=batch)
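
upload_entities returns the guid assignments just like the earlier examples, so capturing the response gives an easy sanity check that every entity in the batch made it. A minimal sketch, written as a follow-up notebook cell; re-running the upload is safe because Atlas upserts on typeName plus qualifiedName:

# COMMAND ----------

import json

# Capture the response to inspect the guid assignments for the batch.
upload_results = client.upload_entities(batch=batch)
print(json.dumps(upload_results["guidAssignments"], indent=2))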
Example #15
                                                atlas_type_defs,
                                                use_column_mapping=True)

    print("Results from excel transformation")
    print(json.dumps(excel_results, indent=2))

    input(">>>>Review the above results to see what your excel file contained")

    # Validate What IF
    whatif = WhatIfValidator(type_defs=atlas_type_defs)

    report = whatif.validate_entities(excel_results)

    if report["total"] > 0:
        print("There were errors in the provided typedefs")
        print(report)
        exit(1)
    else:
        print("There were no errors in the excel file")

    input(
        ">>>>Review the what-if validation results above and get ready to upload your entities!"
    )

    # Upload excel file's content to Atlas and view the guid assignments to confirm successful upload
    uploaded_entities = client.upload_entities(excel_results)
    print(json.dumps(uploaded_entities, indent=2))

    print("Completed uploads of demo!")
    # Be sure to clean up the excel file stored in file_path
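
The cleanup that the comment above asks for can be as small as deleting the generated workbook. A minimal sketch, assuming os is imported at the top of the sample and file_path is the same path passed to excel_reader.make_template:

    # Remove the demo workbook once the upload has been reviewed.
    if os.path.exists(file_path):
        os.remove(file_path)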
                         qualified_name="process_xyz",
                         typeName="process_with_steps",
                         guid=-1003,
                         relationshipAttributes={
                             "steps": [
                                 step01.to_json(minimum=True),
                                 step02.to_json(minimum=True),
                                 step03.to_json(minimum=True),
                             ]
                         },
                         attributes={
                             "inputs": [input01.to_json(minimum=True)],
                             "outputs": [output01.to_json(minimum=True)]
                         })

    # Create a batch of entities to be uploaded to Purview
    batch = [step01, step02, step03, parent, input01, output01]

    # Upload the types
    typeResults = client.upload_typedefs(
        entityDefs=[processWithSteps, processSteps],
        relationshipDefs=[relationship],
        force_update=True)

    # Upload the entities
    results = client.upload_entities(batch)

    # Print the results of the entities upload
    print(json.dumps(results, indent=2))
    print("Successfully created types and entities!")
Example #17
    results = client.upload_typedefs(entityDefs=[edef], force_update=True)

    # Just for demonstration purposes, get the entity type def.
    get_results = client.get_typedef(
        TypeCategory.ENTITY, name="pyapacheatlas_create_type_def_sample")
    print("# Results from getting created type def:")
    print(json.dumps(get_results, indent=2))

    # Creating an instance of this custom type
    actual_entity = AtlasEntity(
        name="instance_of_pyapacheatlas_create_type_def_sample",
        qualified_name=
        "pyapacheatlas://instance_of_pyapacheatlas_create_type_def_sample",
        typeName="pyapacheatlas_create_type_def_sample",
        attributes={
            "column1": "abc",
            "column2": 123,
            "column3": ["a", "b"]
        },
        guid=-100)

    upload_results = client.upload_entities(actual_entity)

    print("# Results of entity upload:")
    print(json.dumps(upload_results, indent=2))

    # To remove, delete the entity created and then the entity type.
    # client.delete_entity(guid=["..."])
    # delete_results = client.delete_type("pyapacheatlas_create_type_def_sample")
    # print(json.dumps(delete_results, indent=2))