Example 1
def test_guid_tracker_get_and_decrement():

    gt = GuidTracker(-100, "decrease")
    results = gt.get_guid()

    expected = -101

    assert(expected == results)
    
    second_expected = -102
    second_results = gt.get_guid()

    assert(second_expected == second_results)
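The behaviour exercised above is just a counter that steps away from a seed value; as a rough point of reference, a minimal stand-in sketch (illustrative only, not pyapacheatlas's actual GuidTracker implementation) could look like this:

class SketchGuidTracker:
    # Illustrative only: hands out placeholder guids by stepping away from a seed.
    def __init__(self, starting=-1000, direction="decrease"):
        self._step = -1 if direction == "decrease" else 1
        self._current = starting

    def peek_next_guid(self):
        # Show the next guid without consuming it.
        return self._current + self._step

    def get_guid(self):
        # Advance the counter and hand out the new guid.
        self._current = self._current + self._step
        return self._current

gt = SketchGuidTracker(-100, "decrease")
assert gt.peek_next_guid() == -101
assert gt.get_guid() == -101
assert gt.get_guid() == -102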
Example 2
def test_batches_entities_with_real_guid():
    gt = GuidTracker()
    a = AtlasEntity("A", "DataSet", "A", guid=gt.get_guid())
    b = AtlasEntity("B", "DataSet", "B", guid=gt.get_guid())
    b.addRelationship(table=a)

    c = AtlasEntity("C", "DataSet", "C", guid=gt.get_guid())
    d = AtlasEntity("D", "DataSet", "D", guid=gt.get_guid())
    c.addRelationship(tester={"guid": "abc-123"})

    entities = [x.to_json() for x in [a, b, c, d]]
    results = batch_dependent_entities(entities, batch_size=2)

    assert (len(results) == 2)
Example 3
import json

from pyapacheatlas.core import AtlasEntity, AtlasProcess
from pyapacheatlas.core.util import GuidTracker


def convert_Spline_to_Purview(splineJson):
    splineJson = json.loads(splineJson)

    # Get notebook info
    notebookInfo = splineJson["extraInfo"]["notebookInfo"]["obj"]
    notebookURL = notebookInfo["notebookURL"].replace("\\", "")

    guid = GuidTracker()

    # Get inputs
    inputs = []
    for read in splineJson["operations"]["reads"]:
        input_path = read["inputSources"][0].replace(
            notebookInfo["mounts"][0],
            "https://adldata.dfs.core.windows.net/data/")
        input_entity = AtlasEntity(name=input_path.split("/")[-1],
                                   typeName="azure_datalake_gen2_path",
                                   qualified_name=input_path,
                                   guid=guid.get_guid())
        inputs.append(input_entity)

    # Get outputs
    write = splineJson["operations"]["write"]
    output_path = write["outputSource"].replace(
        notebookInfo["mounts"][0],
        "https://adldata.dfs.core.windows.net/data/")
    output = AtlasEntity(name=output_path.split("/")[-1],
                         typeName="azure_datalake_gen2_path",
                         qualified_name=output_path,
                         guid=guid.get_guid())

    # Get Process
    process_attributes = {
        "name": notebookInfo["name"],
        "owner": notebookInfo["user"],
        "description": f"Link to spark job notebook: http://{notebookURL}",
        "startTime": notebookInfo["timestamp"],
        "endTime": notebookInfo["timestamp"]
    }
    process = AtlasProcess(name=notebookInfo["name"],
                           typeName="Process",
                           qualified_name=f"adb-{notebookURL[4:20]}",
                           inputs=inputs,
                           outputs=[output],
                           guid=guid.get_guid(),
                           attributes=process_attributes)

    purview_lineage = inputs + [output] + [process]
    return purview_lineage
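The function only builds in-memory entities; a hedged sketch of actually sending them to Purview (the credential values, account name, and the spline_payload variable below are placeholders, assuming the ServicePrincipalAuthentication and PurviewClient classes from pyapacheatlas) might look like:

from pyapacheatlas.auth import ServicePrincipalAuthentication
from pyapacheatlas.core import PurviewClient

# Placeholder credentials; substitute real values or another auth mechanism.
auth = ServicePrincipalAuthentication(
    tenant_id="<tenant-id>",
    client_id="<client-id>",
    client_secret="<client-secret>")
client = PurviewClient(account_name="<purview-account>", authentication=auth)

# spline_payload would be the raw Spline event captured from the Spark job.
lineage = convert_Spline_to_Purview(spline_payload)
results = client.upload_entities(batch=[entity.to_json() for entity in lineage])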
Example 4
def test_peek():
    gt = GuidTracker(-100, "decrease")
    peek_results = gt.peek_next_guid()
    results = gt.get_guid()

    expected = -101

    assert(expected == results)
    assert(results == peek_results)
Example 5
def test_batches_entities_dependent():
    gt = GuidTracker()
    a = AtlasEntity("A", "DataSet", "A", guid=gt.get_guid())
    b = AtlasEntity("B", "DataSet", "B", guid=gt.get_guid())
    b.addRelationship(table=a)
    c = AtlasEntity("C", "DataSet", "C", guid=gt.get_guid())
    d = AtlasEntity("D", "DataSet", "D", guid=gt.get_guid())
    c.addRelationship(parent=b)
    d.addRelationship(parent=b)
    e = AtlasEntity("E", "DataSet", "E", guid=gt.get_guid())
    e.addRelationship(table=a)
    f = AtlasEntity("F", "DataSet", "F", guid=gt.get_guid())
    g = AtlasEntity("G", "DataSet", "G", guid=gt.get_guid())
    g.addRelationship(table=f)
    h = AtlasEntity("H", "DataSet", "H", guid=gt.get_guid())
    h.addRelationship(parent=g)
    # Intentionally out of order
    j = AtlasEntity("J", "DataSet", "J", guid=gt.get_guid())
    k = AtlasEntity("K", "DataSet", "K", guid=gt.get_guid())
    i = AtlasEntity("I", "DataSet", "I", guid=gt.get_guid())

    i.addRelationship(colA=j)
    i.addRelationship(colB=k)

    l = AtlasEntity("L", "DataSet", "L", guid=gt.get_guid())
    m = AtlasEntity("M", "DataSet", "M", guid=gt.get_guid())
    n = AtlasEntity("N", "DataSet", "N", guid=gt.get_guid())
    o = AtlasEntity("O", "DataSet", "O", guid=gt.get_guid())
    p = AtlasEntity("P", "DataSet", "P", guid=gt.get_guid())

    entities = [
        x.to_json() for x in [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p]
    ]
    results = batch_dependent_entities(entities, batch_size=7)
    # There are sixteen entities, so a batch size of 7 means at least three groups
    # One group has seven connected
    # One group should have only three
    # All others are independent
    assert (len(results) == 3)
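Because each batch keeps dependent entities together, every relationship that points at a negative placeholder guid resolves inside its own batch, so the batches can be uploaded one request at a time; a small hedged sketch of such a helper (the helper itself and the client object are assumptions, not part of the tested API) might be:

def upload_in_batches(client, batches):
    # `client` is assumed to be an AtlasClient/PurviewClient created elsewhere.
    responses = []
    for batch in batches:
        # Dependent entities never straddle two batches, so each upload
        # can resolve its own negative placeholder guids.
        responses.append(client.upload_entities(batch=batch))
    return responses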
Example 6
# COMMAND ----------

# Now we begin to do some Atlas uploads using the types created above.
# Get the notebook path as it will be part of our process' name.
notebook_path = dbutils.notebook.entry_point.getDbutils().notebook(
).getContext().notebookPath().get()

# COMMAND ----------

# Create an asset for the input data frame.
atlas_input_df = AtlasEntity(
    name="demo_dbfs_delays_data",
    qualified_name="pyapacheatlas://demo_dbfs_delays_data",
    typeName="custom_spark_dataframe",
    guid=guid.get_guid(),
)

# Create a process that represents our notebook and has our input
# dataframe as one of the inputs.
process = AtlasProcess(
    name="demo_cluster" + notebook_path,
    qualified_name="pyapacheatlas://demo_cluster" + notebook_path,
    typeName="custom_spark_job_process",
    guid=guid.get_guid(),
    attributes={"job_type": "notebook"},
    inputs=[atlas_input_df],
    outputs=[]  # No outputs for this demo; otherwise, repeat what you did for the input dataframe.
)
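A hedged follow-up cell (assuming the notebook created a `client`, e.g. an authenticated PurviewClient, in an earlier cell) would upload both entities together so the process can reference the dataframe by its negative placeholder guid:

# COMMAND ----------

# Upload the input dataframe asset and the process in one request.
# `client` is assumed to have been created earlier in the notebook.
results = client.upload_entities(
    batch=[atlas_input_df.to_json(), process.to_json()])
print(results)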
Example 7
    print("Getting the new glossary id")
    new_glossary = new_client.get_glossary("Glossary")
    new_glossary_guid = new_glossary["guid"]
    print(f"The new glossary guid is: {new_glossary_guid}")

    print("Writing glossary terms in prep to be re-mapped to temporary guids")
    with open(glossary_prep_path, 'w') as fp:
        json.dump(glossary_terms_copy, fp)

    print("Discovering guids and remapping guids")
    # Remap the guids and write them back out to the output_path
    old_glossary_guids = discover_guids([glossary_prep_path],
                                        ["guid", "termGuid"])
    # Provide a new guid (temp guid) for the upload (must be a negative number)
    orig_guid_to_temp_guid = {
        g: str(gt.get_guid())
        for g in old_glossary_guids
    }
    # Execute the find and replace of old guid with temp guid
    remapped_glossary = remap_guids(orig_guid_to_temp_guid,
                                    [glossary_prep_path], output_path)

    print("Processing the glossary terms in memory")
    headers = [
        "antonyms", "classifies", "isA", "preferredTerms", "preferredToTerms",
        "replacedBy", "replacementTerms", "seeAlso", "synonyms",
        "translatedTerms", "translationTerms", "validValues", "validValuesFor"
    ]

    for term in remapped_glossary:
        # Remove everything that will be created automatically
Example 8
    # Upload the results
    upload_results = client.upload_typedefs(
        entityDefs=[column_entity_def, table_entity_def],
        relationshipDefs=[table_column_relationship],
        force_update=True)

    # With all the types and relationships defined, we can create entities.
    # We can use a GuidTracker to always get a unique negative number
    gt = GuidTracker()

    table_entity = AtlasEntity(
        name="sample_table",
        qualified_name="pyapacheatlas://sample_tablepyapacheatlas_custom_type",
        typeName="pyapacheatlas_demo_table",
        guid=gt.get_guid())

    # Add two columns. They must include the "relationshipAttribute" attribute.
    column01 = AtlasEntity(
        name="column01",
        typeName="pyapacheatlas_demo_column",
        qualified_name="pyapacheatlas://sample_tablepyapacheatlas_custom_type@column01",
        attributes={
            "data_type": "string",
            "description": "This is the first column."
        },
        guid=gt.get_guid())
    column02 = AtlasEntity(
        name="column02",
        typeName="pyapacheatlas_demo_column",
Example 9
    if not args.skip_download:
        print("Searching through entities")
        export_records(old_client, folder_path, list_of_types_to_consider)

    # Discover the Relationship Guids that will be uploaded later
    print("Discovering guids to remap from disk")
    relationship_guids = discover_guids(folder_path, ["relationshipGuid"])
    with open(relationships_guid_path, 'a+') as fp:
        for relationship in relationship_guids:
            fp.write(relationship)
            fp.write("\n")

    # Get Guids and replace them
    print("Remapping guids from disk and into memory")
    old_entity_guids = discover_guids(folder_path, indicators)
    orig_guid_to_temp_guid = {g: str(gt.get_guid()) for g in old_entity_guids}
    remapped_entities = remap_guids(orig_guid_to_temp_guid, folder_path,
                                    output_path)

    print("Processing entities in memory")
    for entity in remapped_entities:
        # Strip the relationshipAttributes, they will be added later.
        entity["relationshipAttributes"] = {}
        entity.pop("lastModifiedTS")
        entity.pop("createdBy")
        entity.pop("updatedBy")
        entity.pop("createTime")
        entity.pop("updateTime")

    input("Ready to upload entities. Continue?  Ctrl + C to back out now.")
    # Upload and get results["guidAssignments"] for new guids
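The closing comment points at the next step; a hedged sketch of it (assuming `new_client` and `remapped_entities` from the snippet above, relying on the standard Atlas bulk entity response carrying a guidAssignments map; the output file name is purely illustrative) could be:

import json

upload_results = new_client.upload_entities(batch=remapped_entities)
# guidAssignments maps each temporary negative guid to the real guid the
# target service assigned, which is needed to remap relationships later.
with open("guid_assignments.json", "w") as fp:
    json.dump(upload_results.get("guidAssignments", {}), fp)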
Example 10
        column_term = column_term_root + '@' + args.glossary
        table_name = row["tbl_it_name"]
        table_type = row["tbl_it_type"]
        column_name = row["fld_it_name"]
        column_type = row["fld_it_type"]
        table_desc = row["tbl_description"]
        column_desc = row["fld_description"]
        column_data_type = row["fld_type"]

        known_terms.add(table_term)
        known_terms.add(column_term)

        table = AtlasEntity(name=table_term_root,
                            qualified_name=table_name,
                            typeName=table_type,
                            guid=gt.get_guid(),
                            attributes={"description": table_desc})

        column = AtlasEntity(name=column_term_root,
                             qualified_name=column_name,
                             typeName=column_type,
                             guid=gt.get_guid(),
                             attributes={
                                 "description": column_desc,
                                 "data_type": column_data_type
                             })

        known_entities.add(table)
        known_entities.add(column)

        row_pairs = [(table, table_term), (column, column_term)]
Example 11
    # Now we can load the remapping files
    print('Remapping guids')
    remapped_relationships = remap_guids(old_to_new_guids, relationships_path,
                                         output_path)

    # Clean up the remapped relationship and upload one by one...
    # This will take a while...
    gt = GuidTracker()
    counter = 0
    skipped = 0
    total_relationships = len(remapped_relationships)
    for relationship in remapped_relationships:
        inner_relationship = relationship["relationship"]
        inner_relationship["guid"] = str(gt.get_guid())
        # Pop attributes that break the upload
        inner_relationship.pop("updateTime")
        inner_relationship.pop("lastModifiedTS")
        inner_relationship.pop("updatedBy")
        inner_relationship.pop("createTime")
        inner_relationship.pop("createdBy")
        counter = counter + 1
        try:
            results = new_client.upload_relationship(inner_relationship)
        except Exception as e:
            with open(os.path.join(relationships_path, '_deadletter.txt'),
                      'a+') as fp:
                fp.write(f"{str(e)}\n")
                skipped = skipped + 1
        print(f"Completed {counter}/{total_relationships} {skipped} skipped.")
Example 12
        ]
    )

    # Upload the type definition
    type_results = client.upload_typedefs(entityDefs=[procType], force_update=True)
    print(json.dumps(type_results, indent=2))

    # Set up a guid tracker to make it easier to generate negative guids
    gt = GuidTracker()

    # Now we can create the entities, we will have two inputs and one output
    colMapInput01 = AtlasEntity(
        "Input for Column Mapping",
        "hive_table",
        "pyapacheatlas://colMapInput01",
        guid=gt.get_guid()
    )
    colMapInput02 = AtlasEntity(
        "Second Input for Column Mapping",
        "hive_table",
        "pyapacheatlas://colMapInput02",
        guid=gt.get_guid()
    )
    colMapOutput01 = AtlasEntity(
        "Output for Column Mapping",
        "hive_table",
        "pyapacheatlas://colMapOutput01",
        guid=gt.get_guid()
    )
    
    # Now we can define the column mapping object that will be 'stringified'
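The snippet breaks off just before the mapping itself; as a rough illustration only (the column names here are invented, and the shape follows the general Azure Purview columnMapping convention rather than anything shown in this example), the stringified attribute tends to look like:

import json

# Illustrative mapping of columns from the two inputs onto the output.
column_mapping = [
    {
        "DatasetMapping": {
            "Source": "pyapacheatlas://colMapInput01",
            "Sink": "pyapacheatlas://colMapOutput01"},
        "ColumnMapping": [
            {"Source": "In01Address", "Sink": "OutAddress"}]
    },
    {
        "DatasetMapping": {
            "Source": "pyapacheatlas://colMapInput02",
            "Sink": "pyapacheatlas://colMapOutput01"},
        "ColumnMapping": [
            {"Source": "In02PostalCode", "Sink": "OutPostalCode"}]
    }
]
# The process attribute is passed as a JSON string.
stringified_mapping = json.dumps(column_mapping)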
Example 13
    # including the relationship attributes
    print("Getting the new glossary id")
    new_glossary = new_client.get_glossary("Glossary")
    new_glossary_guid = new_glossary["guid"]
    print(f"The new glossary guid is: {new_glossary_guid}")

    print("Writing glossary terms in prep to be re-mapped to temporary guids")
    with open(glossary_prep_path, 'w') as fp:
        json.dump(glossary_terms_copy, fp)

    print("Discovering guids and remapping guids")
    # Remap the guids and write them back out to the output_path
    old_glossary_guids = discover_guids(
        [glossary_prep_path], ["guid", "termGuid"])
    # Provide a new guid (temp guid) for the upload (must be a negative number)
    orig_guid_to_temp_guid = {g: str(gt.get_guid())
                              for g in old_glossary_guids}
    # Execute the find and replace of old guid with temp guid
    remapped_glossary = remap_guids(orig_guid_to_temp_guid,
                                    [glossary_prep_path], output_path)
    
    print("Processing the glossary terms in memory")
    headers = ["antonyms", "classifies", "isA", "preferredTerms",
               "preferredToTerms", "replacedBy", "replacementTerms",
               "seeAlso", "synonyms", "translatedTerms", "translationTerms",
               "validValues", "validValuesFor"
               ]
    
    for term in remapped_glossary:
        # Remove everything that will be created automatically
        try: