Example 1
def test_peek():
    gt = GuidTracker(-100, "decrease")
    peek_results = gt.peek_next_guid()
    results = gt.get_guid()

    expected = -101

    assert(expected == results)
    assert(results == peek_results)
Example 2
def test_guid_tracker_get_and_decrement():

    gt = GuidTracker(-100, "decrease")
    results = gt.get_guid()

    expected = -101

    assert(expected == results)
    
    second_expected = -102
    second_results = gt.get_guid()

    assert(second_expected == second_results)
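Both tests above exercise the same contract: peek_next_guid reports the next placeholder GUID without advancing the tracker, while get_guid returns that value and decrements. A minimal standalone sketch of that behaviour (the import path is taken from Example 8):

from pyapacheatlas.core.util import GuidTracker

gt = GuidTracker(-100, "decrease")
assert gt.peek_next_guid() == -101  # peeking does not consume the guid
assert gt.get_guid() == -101        # the next get_guid returns the peeked value
assert gt.get_guid() == -102        # every further call decrements by one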
Example 3
def test_batches_entities_with_real_guid():
    gt = GuidTracker()
    a = AtlasEntity("A", "DataSet", "A", guid=gt.get_guid())
    b = AtlasEntity("B", "DataSet", "B", guid=gt.get_guid())
    b.addRelationship(table=a)

    c = AtlasEntity("C", "DataSet", "C", guid=gt.get_guid())
    d = AtlasEntity("D", "DataSet", "D", guid=gt.get_guid())
    c.addRelationship(tester={"guid": "abc-123"})

    entities = [x.to_json() for x in [a, b, c, d]]
    results = batch_dependent_entities(entities, batch_size=2)

    assert (len(results) == 2)
Example 4
import json

from pyapacheatlas.core import AtlasEntity, AtlasProcess
from pyapacheatlas.core.util import GuidTracker


def convert_Spline_to_Purview(splineJson):
    splineJson = json.loads(splineJson)

    # Get notebook info
    notebookInfo = splineJson["extraInfo"]["notebookInfo"]["obj"]
    notebookURL = notebookInfo["notebookURL"].replace("\\", "")

    guid = GuidTracker()

    # Get inputs
    inputs = []
    for read in splineJson["operations"]["reads"]:
        input_path = read["inputSources"][0].replace(
            notebookInfo["mounts"][0],
            "https://adldata.dfs.core.windows.net/data/")
        input_entity = AtlasEntity(name=input_path.split("/")[-1],
                                   typeName="azure_datalake_gen2_path",
                                   qualified_name=input_path,
                                   guid=guid.get_guid())
        inputs.append(input_entity)

    # Get outputs
    write = splineJson["operations"]["write"]
    output_path = write["outputSource"].replace(
        notebookInfo["mounts"][0],
        "https://adldata.dfs.core.windows.net/data/")
    output = AtlasEntity(name=output_path.split("/")[-1],
                         typeName="azure_datalake_gen2_path",
                         qualified_name=output_path,
                         guid=guid.get_guid())

    # Get Process
    process_attributes = {
        "name": notebookInfo["name"],
        "owner": notebookInfo["user"],
        "description": f"Link to spark job notebook: http://{notebookURL}",
        "startTime": notebookInfo["timestamp"],
        "endTime": notebookInfo["timestamp"]
    }
    process = AtlasProcess(name=notebookInfo["name"],
                           typeName="Process",
                           qualified_name=f"adb-{notebookURL[4:20]}",
                           inputs=inputs,
                           outputs=[output],
                           guid=guid.get_guid(),
                           attributes=process_attributes)

    purview_lineage = inputs + [output] + [process]
    return purview_lineage
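The function returns the inputs, output, and process as in-memory objects rather than uploading them; a hedged usage sketch of pushing that lineage to Purview (the splineJson payload and the PurviewClient named client are assumptions, not part of this excerpt):

# Hypothetical follow-on: serialize the lineage and upload it in one batch.
# `client` is assumed to be a PurviewClient configured as in the later examples.
lineage = convert_Spline_to_Purview(splineJson)
upload_results = client.upload_entities(
    batch=[entity.to_json() for entity in lineage])
print(json.dumps(upload_results, indent=2))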
Example 5
import os

from pyapacheatlas.auth import ServicePrincipalAuthentication
from pyapacheatlas.core import PurviewClient
from pyapacheatlas.core.util import GuidTracker
from pyapacheatlas.core.typedef import AtlasAttributeDef, EntityTypeDef

# Add your credentials here or set them as environment variables
tenant_id = ""
client_id = ""
client_secret = ""
purview_account_name = ""

# COMMAND ----------

oauth = ServicePrincipalAuthentication(
    tenant_id=os.environ.get("TENANT_ID", tenant_id),
    client_id=os.environ.get("CLIENT_ID", client_id),
    client_secret=os.environ.get("CLIENT_SECRET", client_secret))
client = PurviewClient(account_name=os.environ.get("PURVIEW_NAME",
                                                   "purview_account_name"),
                       authentication=oauth)
guid = GuidTracker()

# COMMAND ----------

# Set up a few types and relationships
# This is a one time thing but necessary to make the demo work
# It also demonstrates how you can capture different attributes
# for your dataframes, dataframe columns, and jobs.
type_spark_df = EntityTypeDef(name="custom_spark_dataframe",
                              attributeDefs=[AtlasAttributeDef(name="format")],
                              superTypes=["DataSet"],
                              options={"schemaElementAttribute": "columns"})
type_spark_columns = EntityTypeDef(
    name="custom_spark_dataframe_column",
    attributeDefs=[AtlasAttributeDef(name="data_type")],
    superTypes=["DataSet"],
Example 6
    args = parser.parse_args()

    oauth_old = ServicePrincipalAuthentication(
        tenant_id=config["OldClient"]["TENANT_ID"],
        client_id=config["OldClient"]["CLIENT_ID"],
        client_secret=config["OldClient"]["CLIENT_SECRET"])
    old_client = AtlasClient(endpoint_url=config["OldClient"]["ENDPOINT_URL"],
                             authentication=oauth_old)
    oauth_new = ServicePrincipalAuthentication(
        tenant_id=config["NewClient"]["TENANT_ID"],
        client_id=config["NewClient"]["CLIENT_ID"],
        client_secret=config["NewClient"]["CLIENT_SECRET"])
    new_client = AtlasClient(endpoint_url=config["NewClient"]["ENDPOINT_URL"],
                             authentication=oauth_new)

    gt = GuidTracker()

    # Export the glossary terms
    if not args.skip_download:
        print("Exporting the old glossary terms")
        glossary_terms = old_client.get_glossary(detailed=True)
        glossary_terms_copy = list(glossary_terms["termInfo"].values())
        with open(unchanged_path, 'w') as fp:
            json.dump(glossary_terms_copy, fp)

    else:
        print("Loading existing glossary terms from disk")
        with open(unchanged_path, 'r') as fp:
            glossary_terms_copy = json.load(fp)

    # Discover the Relationship Guids that will be uploaded later
Example 7
def test_batches_entities_dependent():
    gt = GuidTracker()
    a = AtlasEntity("A", "DataSet", "A", guid=gt.get_guid())
    b = AtlasEntity("B", "DataSet", "B", guid=gt.get_guid())
    b.addRelationship(table=a)
    c = AtlasEntity("C", "DataSet", "C", guid=gt.get_guid())
    d = AtlasEntity("D", "DataSet", "D", guid=gt.get_guid())
    c.addRelationship(parent=b)
    d.addRelationship(parent=b)
    e = AtlasEntity("E", "DataSet", "E", guid=gt.get_guid())
    e.addRelationship(table=a)
    f = AtlasEntity("F", "DataSet", "F", guid=gt.get_guid())
    g = AtlasEntity("G", "DataSet", "G", guid=gt.get_guid())
    g.addRelationship(table=f)
    h = AtlasEntity("H", "DataSet", "H", guid=gt.get_guid())
    h.addRelationship(parent=g)
    # Intentionally out of order
    j = AtlasEntity("J", "DataSet", "J", guid=gt.get_guid())
    k = AtlasEntity("K", "DataSet", "K", guid=gt.get_guid())
    i = AtlasEntity("I", "DataSet", "I", guid=gt.get_guid())

    i.addRelationship(colA=j)
    i.addRelationship(colB=k)

    l = AtlasEntity("L", "DataSet", "L", guid=gt.get_guid())
    m = AtlasEntity("M", "DataSet", "M", guid=gt.get_guid())
    n = AtlasEntity("N", "DataSet", "N", guid=gt.get_guid())
    o = AtlasEntity("O", "DataSet", "O", guid=gt.get_guid())
    p = AtlasEntity("P", "DataSet", "P", guid=gt.get_guid())

    entities = [
        x.to_json() for x in [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p]
    ]
    results = batch_dependent_entities(entities, batch_size=7)
    # There are sixteen entities; a batch size of 7 means at least three groups
    # One group has seven connected
    # One group should have only three
    # All others are independent
    assert (len(results) == 3)
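batch_dependent_entities returns the entities grouped so that related ones always land in the same batch; a hedged sketch of what a caller might do with those groups (the client here is an assumption, not part of the test):

# Hypothetical follow-on: upload each dependency-preserving batch on its own.
# `client` is assumed to be a configured AtlasClient or PurviewClient.
for batch in results:
    client.upload_entities(batch=batch)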
Example 8
from pyapacheatlas.auth import ServicePrincipalAuthentication
from pyapacheatlas.core import PurviewClient, AtlasEntity, AtlasProcess, TypeCategory
from pyapacheatlas.core.util import GuidTracker
from pyapacheatlas.core.typedef import AtlasAttributeDef, EntityTypeDef, RelationshipTypeDef
from pyapacheatlas.readers import ExcelConfiguration, ExcelReader

# The above cell gets the v_tenant_id, v_client_id, etc.

auth = ServicePrincipalAuthentication(tenant_id=v_tenant_id,
                                      client_id=v_client_id,
                                      client_secret=v_client_secret)

# Create a client to connect to your service.
client = PurviewClient(account_name=v_data_catalog_name, authentication=auth)

guid = GuidTracker()

# COMMAND ----------

# Search for the entity you want to delete
import json
import os
search = client.search_entities("loan_risk_data.csv")
for page in search:
    print(json.dumps(page, indent=2))

# COMMAND ----------

# MAGIC %md
# MAGIC ##### 3. Bulk delete up to 50 entities
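The cell that performs the delete is not part of the excerpt; a hedged sketch of the bulk delete the markdown heading describes (reading the guid from each hit's "id" field and the delete_entity call are assumptions based on typical pyapacheatlas usage, not code from this notebook):

# Hypothetical next cell: collect the guids found by the search above and
# delete them in a single call, staying under the 50-entity limit noted above.
guids_to_delete = [hit["id"]
                   for hit in client.search_entities("loan_risk_data.csv")]
delete_results = client.delete_entity(guid=guids_to_delete[:50])
print(json.dumps(delete_results, indent=2))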
            "type": column_entity_def.name,
            "name": "table",
            "isContainer": False,
            "cardinality": "SINGLE",
            "isLegacyAttribute": False
        })

    # Upload the results
    upload_results = client.upload_typedefs(
        entityDefs=[column_entity_def, table_entity_def],
        relationshipDefs=[table_column_relationship],
        force_update=True)

    # With all the types and relationships defined, we can create entities.
    # We can use a GuidTracker to always get a unique negative number
    gt = GuidTracker()

    table_entity = AtlasEntity(
        name="sample_table",
        qualified_name="pyapacheatlas://sample_tablepyapacheatlas_custom_type",
        typeName="pyapacheatlas_demo_table",
        guid=gt.get_guid())

    # Add two columns. They must include the "relationshipAttribute" attribute.
    column01 = AtlasEntity(
        name="column01",
        typeName="pyapacheatlas_demo_column",
        qualified_name=
        "pyapacheatlas://sample_tablepyapacheatlas_custom_type@column01",
        attributes={
            "data_type": "string",
Example 10
    args = parser.parse_args()

    oauth_old = ServicePrincipalAuthentication(
        tenant_id=config["OldClient"]["TENANT_ID"],
        client_id=config["OldClient"]["CLIENT_ID"],
        client_secret=config["OldClient"]["CLIENT_SECRET"])
    old_client = AtlasClient(endpoint_url=config["OldClient"]["ENDPOINT_URL"],
                             authentication=oauth_old)
    oauth_new = ServicePrincipalAuthentication(
        tenant_id=config["NewClient"]["TENANT_ID"],
        client_id=config["NewClient"]["CLIENT_ID"],
        client_secret=config["NewClient"]["CLIENT_SECRET"])
    new_client = AtlasClient(endpoint_url=config["NewClient"]["ENDPOINT_URL"],
                             authentication=oauth_new)

    gt = GuidTracker(starting=-50000)
    # Get all the types you want to "export"
    list_of_types_to_consider = [
        "demo_column", "demo_table", "demo_table_columns", "demo_process",
        "demo_column_lineage"
    ]

    indicators = ["guid"]

    # Export the list of types to consider
    if not args.skip_download:
        print("Searching through entities")
        export_records(old_client, folder_path, list_of_types_to_consider)

    # Discover the Relationship Guids that will be uploaded later
    print("Discovering guids to remap from disk")
Example 11
                counter = counter + 1

        if len(buffer) > 0:
            with open(os.path.join(relationships_path, f"batch-last.json"),
                      'w') as fp:
                json.dump(buffer, fp)

    # Now we can load the remapping files
    print('Remapping guids')
    remapped_relationships = remap_guids(old_to_new_guids, relationships_path,
                                         output_path)

    # Clean up the remapped relationships and upload them one by one...
    # This will take a while...
    gt = GuidTracker()
    counter = 0
    skipped = 0
    total_relationships = len(remapped_relationships)
    for relationship in remapped_relationships:
        inner_relationship = relationship["relationship"]
        inner_relationship["guid"] = str(gt.get_guid())
        # Pop attributes that break the upload
        inner_relationship.pop("updateTime")
        inner_relationship.pop("lastModifiedTS")
        inner_relationship.pop("updatedBy")
        inner_relationship.pop("createTime")
        inner_relationship.pop("createdBy")
        counter = counter + 1
        try:
            results = new_client.upload_relationship(inner_relationship)
Example 12
    oauth_old = ServicePrincipalAuthentication(
        tenant_id=config["OldClient"]["TENANT_ID"],
        client_id=config["OldClient"]["CLIENT_ID"],
        client_secret=config["OldClient"]["CLIENT_SECRET"]
    )
    old_client = AtlasClient(
        endpoint_url=config["OldClient"]["ENDPOINT_URL"],
        authentication=oauth_old
    )
    oauth_new = ServicePrincipalAuthentication(
        tenant_id=config["NewClient"]["TENANT_ID"],
        client_id=config["NewClient"]["CLIENT_ID"],
        client_secret=config["NewClient"]["CLIENT_SECRET"]
    )
    new_client = AtlasClient(
        endpoint_url=config["NewClient"]["ENDPOINT_URL"],
        authentication=oauth_new
    )

    gt = GuidTracker()

    # Export the glossary terms
    if not args.skip_download:
        print("Exporting the old glossary terms")
        glossary_terms = old_client.get_glossary(detailed=True)
        glossary_terms_copy = list(glossary_terms["termInfo"].values())
        with open(unchanged_path, 'w') as fp:
            json.dump(glossary_terms_copy, fp)

    else:
        print("Loading existing glossary terms from disk")
        with open(unchanged_path, 'r') as fp:
            glossary_terms_copy = json.load(fp)

    # Discover the Relationship Guids that will be uploaded later