Example #1
def test_from_process_lookup_col_lineage():
    entities = [
        AtlasProcess(
            name="demo_process_name",
            typeName="demo_process",
            qualified_name="demo_process_qualifier",
            inputs=[],
            outputs=[],
            guid=-1002,
            relationshipAttributes=[
                {"name": "dummy"},
                {"name": "columnLineages", "typeName": "array<demo_column_lineage>"}
            ]
        ),
        AtlasProcess(
            name="demo_process_qualifier_no_in",
            typeName="demo_process1",
            qualified_name="demo_process_qualifier_no_in",
            inputs=[],
            outputs=[],
            guid=-1003
        ),
        AtlasProcess(
            name="demo_process_qualifier_no_out",
            typeName="demo_process2",
            qualified_name="demo_process_qualifier_no_out",
            inputs=[],
            outputs=[],
            guid=-1004
        )
    ]
    process_col_lineage = from_process_lookup_col_lineage(
        "demo_process_name", entities, RELATIONSHIP_TYPE_DEFS)

    assert process_col_lineage == "demo_column_lineage"
Example #2
def setup_batch_entities():
    atlas_entities = [
        AtlasEntity(
            name="demoentity",
            typeName="demo_table",
            qualified_name="demoentity",
            guid=-1000
        ),
        AtlasEntity(
            name="demoentity2",
            typeName="demo2_table",
            qualified_name="demoentity2",
            guid=-1001
        )
    ]
    atlas_proc = AtlasProcess(
        name="demo_process_name",
        typeName="demo_process",
        qualified_name="demo_process_qualifier",
        inputs=[atlas_entities[0].to_json(minimum=True)],
        outputs=[atlas_entities[1].to_json(minimum=True)],
        guid=-1002
    )
    atlas_proc_no_in = AtlasProcess(
        name="demo_process_qualifier_no_in",
        typeName="demo_process1",
        qualified_name="demo_process_qualifier_no_in",
        inputs=[],
        outputs=[atlas_entities[1].to_json(minimum=True)],
        guid=-1003
    )
    atlas_proc_no_out = AtlasProcess(
        name="demo_process_qualifier_no_out",
        typeName="demo_process2",
        qualified_name="demo_process_qualifier_no_out",
        inputs=[atlas_entities[0].to_json(minimum=True)],
        outputs=[],
        guid=-1004
    )
    atlas_entities.extend([atlas_proc, atlas_proc_no_in, atlas_proc_no_out])
    return atlas_entities
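A possible usage sketch, not part of the original test module: the batch returned by setup_batch_entities can be serialized and uploaded with an already-configured client. The `client` name below is an assumption standing in for whichever AtlasClient or PurviewClient the surrounding tests construct.

# Illustrative sketch only: `client` is assumed to be an authenticated
# AtlasClient or PurviewClient created elsewhere in the test setup.
batch = setup_batch_entities()
results = client.upload_entities(batch=[entity.to_json() for entity in batch])
print(results.get("guidAssignments", {}))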
Example #3
import json

from pyapacheatlas.core import AtlasEntity, AtlasProcess
from pyapacheatlas.core.util import GuidTracker


def convert_Spline_to_Purview(splineJson):
    splineJson = json.loads(splineJson)

    # Get notebook info
    notebookInfo = splineJson["extraInfo"]["notebookInfo"]["obj"]
    notebookURL = notebookInfo["notebookURL"].replace("\\", "")

    guid = GuidTracker()

    # Get inputs
    inputs = []
    for read in splineJson["operations"]["reads"]:
        input_path = read["inputSources"][0].replace(
            notebookInfo["mounts"][0],
            "https://adldata.dfs.core.windows.net/data/")
        input_entity = AtlasEntity(name=input_path.split("/")[-1],
                                   typeName="azure_datalake_gen2_path",
                                   qualified_name=input_path,
                                   guid=guid.get_guid())
        inputs.append(input_entity)

    # Get outputs
    write = splineJson["operations"]["write"]
    output_path = write["outputSource"].replace(
        notebookInfo["mounts"][0],
        "https://adldata.dfs.core.windows.net/data/")
    output = AtlasEntity(name=output_path.split("/")[-1],
                         typeName="azure_datalake_gen2_path",
                         qualified_name=output_path,
                         guid=guid.get_guid())

    # Get Process
    process_attributes = {
        "name": notebookInfo["name"],
        "owner": notebookInfo["user"],
        "description": f"Link to spark job notebook: http://{notebookURL}",
        "startTime": notebookInfo["timestamp"],
        "endTime": notebookInfo["timestamp"]
    }
    process = AtlasProcess(name=notebookInfo["name"],
                           typeName="Process",
                           qualified_name=f"adb-{notebookURL[4:20]}",
                           inputs=inputs,
                           outputs=[output],
                           guid=guid.get_guid(),
                           attributes=process_attributes)

    purview_lineage = inputs + [output] + [process]
    return purview_lineage
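A hedged follow-up sketch rather than part of the sample: the list returned by convert_Spline_to_Purview can be pushed to Purview in one batch. Here `purview_client` and `spline_payload` are assumed names for an authenticated PurviewClient and the raw Spline JSON string.

# Sketch of how the converted lineage might be uploaded.
# `purview_client` and `spline_payload` are assumed to exist in the caller.
purview_lineage = convert_Spline_to_Purview(spline_payload)
upload_results = purview_client.upload_entities(
    batch=[entity.to_json() for entity in purview_lineage]
)
print(json.dumps(upload_results, indent=2))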
Example #4
def test_min_entity_json_no_guid_usage():

    ae = AtlasEntity("BeforeModi",
                     "DataSet",
                     "tests://EntityRESTBeforeModification",
                     guid=-1)

    assignments = client.upload_entities([ae])["guidAssignments"]
    assign_with_no_guid = {}
    try:
        # live_table = client.get_entity(guid=assignments["-1"])["entities"][0]
        ae_no_guid = AtlasEntity("BeforeModi",
                                 "DataSet",
                                 "tests://EntityRESTBeforeModification",
                                 guid=None)

        proc1 = AtlasProcess("WillBeUpdatedWithNoGuidEntity",
                             "Process",
                             "tests://EntityRESTBeforeModificationProc",
                             inputs=[ae_no_guid],
                             outputs=[],
                             guid=-2)
        assign_with_no_guid = client.upload_entities([proc1
                                                      ])["guidAssignments"]

        live_proc = client.get_entity(
            guid=assign_with_no_guid["-2"])["entities"][0]

        # Should have one input that matches the guid assignment
        assert (len(live_proc["attributes"]["inputs"]) == 1)
        assert (
            live_proc["attributes"]["inputs"][0]["guid"] == assignments["-1"])

    finally:
        # Delete the entities now that the test is complete
        _ = client.delete_entity(assignments["-1"])
        if "-2" in assign_with_no_guid:
            _ = client.delete_entity(assign_with_no_guid.get("-2"))
Example #5
# Create an asset for the input data frame.
atlas_input_df = AtlasEntity(
    name="demo_dbfs_delays_data",
    qualified_name="pyapacheatlas://demo_dbfs_delays_data",
    typeName="custom_spark_dataframe",
    guid=guid.get_guid(),
)

# Create a process that represents our notebook and has our input
# dataframe as one of the inputs.
process = AtlasProcess(
    name="demo_cluster" + notebook_path,
    qualified_name="pyapacheatlas://demo_cluster" + notebook_path,
    typeName="custom_spark_job_process",
    guid=guid.get_guid(),
    attributes={"job_type": "notebook"},
    inputs=[atlas_input_df],
    outputs=[]  # No outputs for this demo; otherwise, repeat what you did for the input dataframe.
)

# Iterate over the input data frame's columns and create them.
# Note: This is an add, not a delete. If the dataframe already exists in
# Atlas/Data Catalog, this sample is not smart enough to prune any 'orphan'
# columns. They will continue to exist and point to the dataframe.
atlas_input_df_columns = []
for column in df.schema:
    temp_column = AtlasEntity(
        name=column.name,
        typeName="custom_spark_dataframe_column",
        qualified_name="pyapacheatlas://demo_dbfs_delays_data#" + column.name,
Example #6
import json
import os

from pyapacheatlas.auth import BasicAuthentication, ServicePrincipalAuthentication
from pyapacheatlas.core import AtlasClient, AtlasEntity, AtlasProcess, PurviewClient
from pyapacheatlas.readers import ExcelConfiguration, ExcelReader

ae_in = AtlasEntity("test_in", "hive_table", "test://lineage_hive_in", -101)
ae_out = AtlasEntity("test_out", "hive_table", "test://lineage_hive_out", -102)
proc = AtlasProcess("test_proc", "Process", "test://lineage_hive_out", guid=-103,
                    inputs=[ae_in], outputs=[ae_out]
                    )
LINEAGE_BATCH = [ae_in, ae_out, proc]

auth = BasicAuthentication(username="******", password="******")
client = AtlasClient(endpoint_url="http://localhost:21000/api/atlas/v2",
                     authentication=auth)

oauth = ServicePrincipalAuthentication(
    tenant_id=os.environ.get("TENANT_ID", ""),
    client_id=os.environ.get("CLIENT_ID", ""),
    client_secret=os.environ.get("CLIENT_SECRET", "")
)
purview_client = PurviewClient(
    account_name=os.environ.get("PURVIEW_NAME", ""),
    authentication=oauth
)


def test_lineage_atlas():
Example #7
        typeName="DataSet",
        qualified_name="pyapacheatlas://demoinput01",
        guid=-100
    )
    output01 = AtlasEntity(
        name="output01",
        typeName="DataSet",
        qualified_name="pyapacheatlas://demooutput01",
        guid=-101
    )

    # The Atlas Process is the lineage component that links the two
    # entities together. The inputs and outputs need to be the "header"
    # version of the atlas entities, so specify minimum = True to
    # return just guid, qualifiedName, and typeName.
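    # As a rough illustration (the exact shape may vary by library version),
    # input01.to_json(minimum=True) returns something like:
    #   {"typeName": "DataSet", "guid": -100,
    #    "qualifiedName": "pyapacheatlas://demoinput01"}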
    process = AtlasProcess(
        name="sample process",
        typeName="Process",
        qualified_name="pyapacheatlas://democustomprocess",
        inputs=[input01.to_json(minimum=True)],
        outputs=[output01.to_json(minimum=True)],
        guid=-102
    )

    # Convert the individual entities into json before uploading.
    results = client.upload_entities(
        batch=[output01.to_json(), input01.to_json(), process.to_json()]
    )

    print(json.dumps(results, indent=2))
Example #8
                           authentication=oauth)

    # Create two entities with AtlasEntity
    # You must provide a name, typeName, qualified_name, and guid
    # the guid must be a negative number and unique in your batch
    # being uploaded.
    input01 = AtlasEntity(name="input01",
                          typeName="DataSet",
                          qualified_name="pyapacheatlas://demoinput01",
                          guid=-100)
    output01 = AtlasEntity(name="output01",
                           typeName="DataSet",
                           qualified_name="pyapacheatlas://demooutput01",
                           guid=-101)

    # The Atlas Process is the lineage component that links the two
    # entities together. The inputs and outputs need to be the "header"
    # version of the atlas entities, so specify minimum = True to
    # return just guid, qualifiedName, and typeName.
    process = AtlasProcess(name="sample process",
                           typeName="Process",
                           qualified_name="pyapacheatlas://democustomprocess",
                           inputs=[input01],
                           outputs=[output01],
                           guid=-102)

    # Convert the individual entities into json before uploading.
    results = client.upload_entities(batch=[output01, input01, process])

    print(json.dumps(results, indent=2))
Example #9
# MAGIC %md
# MAGIC ##### 6. Upload Notebook mapping into Purview

# COMMAND ----------

maps = spark.read.option("header", "true").csv("/mnt/datafiles/purview/notebook_mapping.csv")
for row in maps.rdd.collect():
    nbname = row.notebook.split('/')[-1]
    print("Adding: " + nbname)
    InputEntity = client.get_entity(
        qualifiedName=[row.source],
        typeName='azure_datalake_gen2_path'
    )
    OutputEntity = client.get_entity(
        qualifiedName=[row.target],
        typeName="databricks_table"
    )
    job_process = AtlasProcess(
        name=nbname,
        qualified_name="databricks://" + v_databricks_domain + "/notebooks/" + nbname,
        typeName="databricks_job",
        guid=guid.get_guid(),
        attributes={"job_type": "notebook", "notebook_path": row.notebook},
        inputs=[InputEntity.get("entities")[0]],
        outputs=[OutputEntity.get("entities")[0]]
    )

    client.upload_entities(job_process)

  
Example #10
            {"Source": "*", "Sink": "Out01UniqueField3"},
            {"Source": "*", "Sink": "Out01UniqueField4"}],
            "DatasetMapping": {"Source":"*","Sink": colMapOutput01.qualifiedName}
         },
         # This is another example of the above special case for an input object
         {"ColumnMapping": [
            {"Source": "*", "Sink": "In01UniqueField"},
            {"Source": "*", "Sink": "In01UniqueField2"}],
            "DatasetMapping": {"Source": "*", "Sink": colMapInput01.qualifiedName}
         }
    ]

    # Create the process with the stringified column mapping json.
    process = AtlasProcess(
        name="test process",
        typeName="ProcessWithColumnMapping",
        qualified_name="pyapacheatlas://colMapOutputProcessDemo",
        inputs=[colMapInput01, colMapInput02],
        outputs=[colMapOutput01],
        guid=gt.get_guid(),
        attributes={"columnMapping": json.dumps(column_mapping)}
    )

    results = client.upload_entities(
        [process, colMapInput01, colMapInput02, colMapOutput01]
    )

    print(json.dumps(results, indent=2))
    sink_guid = results["guidAssignments"][str(colMapOutput01.guid)]
    print(f"Search for \"{colMapOutput01.name}\" or use the guid {sink_guid} to look up the sink table.")
Example #11
    # existing_process.outputs = [ new_output ]

    # # Convert the individual entities into json before uploading.
    # results = client.upload_entities(
    #     batch = [new_output.to_json(), existing_process.to_json()]
    # )

    # print(json.dumps(results, indent=2))

    print("Starting Append Scenario...")
    # A second scenario would have us appending to an existing process
    # To do that, we need to query for the existing entity
    dummy_existing_process = AtlasProcess(
        name="sample_process_xyz",
        typeName="Process",
        qualified_name="pyapacheatlas://democustomprocess",
        inputs=None,  # Set to None so no update will occur
        outputs=None,  # We will update this with .outputs below
        guid=-104)

    real_existing_process = client.get_entity(
        typeName="Process",
        qualifiedName="pyapacheatlas://democustomprocess")["entities"][0]
    print("Working with process guid: {}".format(
        real_existing_process["guid"]))

    # Get the list of existing outputs from the attributes.
    existing_outputs = real_existing_process["attributes"]["outputs"]

    # Create one more output to be added.
    one_more_output = AtlasEntity(