Example #1
def test_set_relationship_different_ways():

    ae = AtlasEntity("rel01","hive_table", "tests://rel01", guid=-1)
    c1 = AtlasEntity("rel01#01", "hive_column", "tests://rel01#c", guid=-2, attributes={"type":"str"})
    c2 = AtlasEntity("rel01#02", "hive_column", "tests://rel02#c", guid=-3, attributes={"type":"str"})
    c3 = AtlasEntity("rel01#03", "hive_column", "tests://rel03#c", guid=-4, attributes={"type":"str"})
    c4 = AtlasEntity("rel01#04", "hive_column", "tests://rel04#c", guid=-5, attributes={"type":"str"})

    # Add c1 as the only relationship
    ae.addRelationship(columns=[c1.to_json(minimum=True)])

    c2.relationshipAttributes.update({"table": ae.to_json(minimum=True) })
    c3.addRelationship(table = ae)

    assignments = client.upload_entities([ae, c1, c2, c3, c4])["guidAssignments"]
    try:
        live_table = client.get_entity(guid=assignments["-1"])["entities"][0]
        
        # Should have three columns: one from the table having the
        # relationship defined as an array of columns, and the other two from
        # the columns having the table relationshipAttribute defined on them.
        assert(len(live_table["relationshipAttributes"]["columns"]) == 3)

        relationship = {
                    "typeName": "hive_table_columns",
                    "attributes": {},
                    "guid": -100,
                    # Ends are either guid or guid + typeName 
                    # (in case there are ambiguities?)
                    "end1": {
                        "guid": assignments["-1"]
                    },
                    "end2": {
                        "guid": assignments["-5"]
                    }
                }
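        # (Sketch / assumption, not used below: an end can also be given
        # without a guid, via typeName + uniqueAttributes, e.g.
        # {"typeName": "hive_column",
        #  "uniqueAttributes": {"qualifiedName": "tests://rel04#c"}}.)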

        relation_upload = client.upload_relationship(relationship)
        # Check that we have one more relationship
        # There are caching issues here :-(
        live_table_post_relationship = client.get_entity(guid=assignments["-1"])["entities"][0]
        assert(len(live_table_post_relationship["relationshipAttributes"]["columns"]) == 4)
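        # (Hypothetical workaround, not in the original test: if this count
        # lags because of the caching noted above, re-fetch the entity in a
        # short retry loop with time.sleep() instead of asserting once.)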

    finally:
        # Need to delete all columns BEFORE you delete the table
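        # range(-5, 0) yields -5 .. -1, so the columns (guids -5 .. -2) are
        # deleted before the table's -1 placeholder.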
        for local_id in [str(s) for s in range(-5,0)]:
            guid = assignments[local_id]
            _ = client.delete_entity(guid)
Example #2
def test_prepare_bulk_entity_from_mixed_atlas_entity_dict():

    class_entity = AtlasEntity(
        name=sample_entity["attributes"]["name"],
        typeName=sample_entity["typeName"],
        qualified_name=sample_entity["attributes"]["qualifiedName"],
        attributes=sample_entity["attributes"],
        guid=sample_entity["guid"],
        relationshipAttributes=sample_entity["relationshipAttributes"])
    class_entity2 = AtlasEntity(
        name=sample_entity["attributes"]["name"] + "abc",
        typeName=sample_entity["typeName"],
        qualified_name=sample_entity["attributes"]["qualifiedName"] + "abc",
        attributes=sample_entity["attributes"],
        guid=sample_entity["guid"],
        relationshipAttributes=sample_entity["relationshipAttributes"])

    results = AtlasClient._prepare_entity_upload(
        [class_entity, class_entity2.to_json()])

    sample2 = sample_entity.copy()
    sample2["attributes"]["name"] = sample2["attributes"]["name"] + "abc"
    sample2["attributes"][
        "qualifiedName"] = sample2["attributes"]["qualifiedName"] + "abc"

    expected = {"entities": [sample_entity, sample2]}

    assert (results == expected)
    input01 = AtlasEntity(name="demoinput01",
                          qualified_name="demoinput01",
                          guid=-5000,
                          typeName="DataSet")
    output01 = AtlasEntity(name="demooutput01",
                           qualified_name="demooutput01",
                           guid=-5001,
                           typeName="DataSet")

    parent = AtlasEntity(name="my_complex_workflow",
                         qualified_name="process_xyz",
                         typeName="process_with_steps",
                         guid=-1003,
                         relationshipAttributes={
                             "steps": [
                                 step01.to_json(minimum=True),
                                 step02.to_json(minimum=True),
                                 step03.to_json(minimum=True),
                             ]
                         },
                         attributes={
                             "inputs": [input01.to_json(minimum=True)],
                             "outputs": [output01.to_json(minimum=True)]
                         })

    # Create a batch of entities to be uploaded to Purview
    batch = [step01, step02, step03, parent, input01, output01]

    # Upload the types
    typeResults = client.upload_typedefs(
        entityDefs=[processWithSteps, processSteps],
Example #4
    outputs=[
    ]  # No outputs for this demo, but otherwise, repeat what you did for the input dataframe.
)

# Iterate over the input data frame's columns and create them.
# Note: This is an add, not a delete. If the dataframe already exists in
# Atlas/Data Catalog, this sample is not smart enough to prune any 'orphan'
# columns. They will continue to exist and point to the dataframe.
atlas_input_df_columns = []
for column in df.schema:
    temp_column = AtlasEntity(
        name=column.name,
        typeName="custom_spark_dataframe_column",
        qualified_name="pyapacheatlas://demo_dbfs_delays_data#" + column.name,
        guid=guid.get_guid(),
        attributes={"data_type": str(column.dataType)},
        relationshipAttributes={
            "dataframe": atlas_input_df.to_json(minimum=True)
        })
    atlas_input_df_columns.append(temp_column)
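
# Hedged sketch, not part of the original sample: one way to prune the
# 'orphan' columns mentioned above. It assumes the live dataframe entity's
# relationshipAttributes["columns"] entries expose a "guid" and a
# "displayText" holding the column name -- treat that shape, and the
# qualifiedName lookup, as assumptions.
#
# live_df = client.get_entity(
#     qualifiedName="pyapacheatlas://demo_dbfs_delays_data",
#     typeName="custom_spark_dataframe"
# )["entities"][0]
# current_names = set(c.name for c in df.schema)
# for col in live_df["relationshipAttributes"].get("columns", []):
#     if col.get("displayText") not in current_names:
#         client.delete_entity(guid=col["guid"])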

# COMMAND ----------

# Prepare all the entities as a batch to be uploaded.
batch = [process, atlas_input_df] + atlas_input_df_columns

# COMMAND ----------

# Upload all entities!
client.upload_entities(batch=batch)
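
# Note: upload_entities returns the guid assignments that map the negative
# placeholder guids above to the guids assigned by the service; capturing the
# return value (e.g. results["guidAssignments"]) is how the other examples on
# this page look up the new guids.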
Example #5
    # being uploaded.
    input01 = AtlasEntity(
        name="input01",
        typeName="DataSet",
        qualified_name="pyapacheatlas://demoinput01",
        guid=-100
    )
    input02 = AtlasEntity(
        name="input02",
        typeName="DataSet",
        qualified_name="pyapacheatlas://demoinput02",
        guid=-101
    )

    results = client.upload_entities(
        batch=[input01.to_json(), input02.to_json()]
    )

    # Get the Guids for us to work with
    guids = [v for v in results["guidAssignments"].values()]
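    # Note (assumption): this relies on guidAssignments preserving the upload
    # order, so guids[0] is taken to be input01's new guid.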

    # Classify one entity with multiple classifications
    print(f"Adding multiple classifications to guid: {guids[0]}")
    one_entity_multi_class = client.classify_entity(
        guid=guids[0], 
        classifications=[
            AtlasClassification("MICROSOFT.PERSONAL.DATE_OF_BIRTH").to_json(),
            AtlasClassification("MICROSOFT.PERSONAL.NAME").to_json()
            ],
        force_update=True
    )
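    # Hedged sketch (API call assumed, not shown in this excerpt): the
    # complementary operation -- applying one classification to many
    # entities at once -- would look roughly like:
    # client.classify_bulk_entities(
    #     entityGuids=guids,
    #     classification=AtlasClassification("MICROSOFT.PERSONAL.NAME")
    # )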
Example #6
        typeName="DataSet",
        qualified_name="pyapacheatlas://demoinput01",
        guid=-100
    )
    output01 = AtlasEntity(
        name="output01",
        typeName="DataSet",
        qualified_name="pyapacheatlas://demooutput01",
        guid=-101
    )

    # The Atlas Process is the lineage component that links the two
    # entities together. The inputs and outputs need to be the "header"
    # version of the atlas entities, so specify minimum = True to
    # return just guid, qualifiedName, and typeName.
    process = AtlasProcess(
        name="sample process",
        typeName="Process",
        qualified_name="pyapacheatlas://democustomprocess",
        inputs=[input01.to_json(minimum=True)],
        outputs=[output01.to_json(minimum=True)],
        guid=-102
    )

    # Convert the individual entities into json before uploading.
    results = client.upload_entities(
        batch=[output01.to_json(), input01.to_json(), process.to_json()]
    )

    print(json.dumps(results, indent=2))
atlas_input_df = AtlasEntity(
    name="demo_dbfs_delays_data",
    qualified_name="pyapacheatlas://demo_dbfs_delays_data",
    typeName="custom_spark_dataframe",
    guid=guid.get_guid(),
)

# Create a process that represents our notebook and has our input
# dataframe as one of the inputs.
process = AtlasProcess(
    name="demo_cluster" + notebook_path,
    qualified_name="pyapacheatlas://demo_cluster" + notebook_path,
    typeName="custom_spark_job_process",
    guid=guid.get_guid(),
    attributes={"job_type": "notebook"},
    inputs=[atlas_input_df.to_json(minimum=True)],
    outputs=[
    ]  # No outputs for this demo, but otherwise, repeat what you did for the input dataframe.
)

# Iterate over the input data frame's columns and create them.
# Note: This is an add, not a delete. If the dataframe already exists in
# Atlas/Data Catalog, this sample is not smart enough to prune any 'orphan'
# columns. They will continue to exist and point to the dataframe.
atlas_input_df_columns = []
for column in df.schema:
    temp_column = AtlasEntity(
        name=column.name,
        typeName="custom_spark_dataframe_column",
        qualified_name="pyapacheatlas://demo_dbfs_delays_data#" + column.name,
        guid=guid.get_guid(),
Example #8
                     "tests://rel02#c",
                     guid=-3,
                     attributes={"type": "str"})
    c3 = AtlasEntity("rel01#03",
                     "hive_column",
                     "tests://rel03#c",
                     guid=-4,
                     attributes={"type": "str"})
    c4 = AtlasEntity("rel01#04",
                     "hive_column",
                     "tests://rel04#c",
                     guid=-5,
                     attributes={"type": "str"})

    # Add c1 as the only relationship to the table
    table.addRelationship(columns=[c1.to_json(minimum=True)])

    c2.relationshipAttributes.update({"table": table.to_json(minimum=True)})
    c3.addRelationship(table=table)

    assignments = client.upload_entities([table, c1, c2, c3,
                                          c4])["guidAssignments"]

    try:
        live_table = client.get_entity(guid=assignments["-1"])["entities"][0]

        # Should have three columns: one from the table having the
        # relationship defined as an array of columns, and the other two from
        # the columns having the table relationshipAttribute defined on them.
        print("Here's what the upload looks like!")
        print(json.dumps(live_table["relationshipAttributes"], indent=2))
    # Create some entities up front, before we get to parsing the spreadsheet,
    # so we have something to work with.
    # This is not necessary if you are working with existing entities.
    inputTable = AtlasEntity(
        name="demo_hive_source",
        typeName="hive_table",
        qualified_name="pyapacheatlas://demo_update_lineage_input",
        guid=-100)
    outputTable = AtlasEntity(
        name="demo_hive_target",
        typeName="hive_table",
        qualified_name="pyapacheatlas://demo_update_lineage_output",
        guid=-101)
    # Upload these entities so we have something to work with
    # This will throw an exception if something goes wrong; otherwise we
    # discard the resulting json.
    _ = client.upload_entities([inputTable.to_json(), outputTable.to_json()])

    # Create an empty excel template to be populated
    excel_reader.make_template(file_path)
    # This is just a helper to fill in some demo data
    fill_in_workbook(file_path, excel_config)

    # ACTUAL WORK: This parses our excel file and creates a batch to upload
    lineage_processes = excel_reader.parse_update_lineage(file_path)

    # This is what is getting sent to your Atlas server
    # print(json.dumps(lineage_processes,indent=2))

    results = client.upload_entities(lineage_processes)

    print(json.dumps(results, indent=2))
    input01 = AtlasEntity(name="demoinput01",
                          qualified_name="demoinput01",
                          guid=-5000,
                          typeName="DataSet")
    output01 = AtlasEntity(name="demooutput01",
                           qualified_name="demooutput01",
                           guid=-5001,
                           typeName="DataSet")

    parent = AtlasEntity(name="my_complex_workflow",
                         qualified_name="process_xyz",
                         typeName="process_with_steps",
                         guid=-1003,
                         relationshipAttributes={
                             "steps": [
                                 step01.to_json(minimum=True),
                                 step02.to_json(minimum=True),
                                 step03.to_json(minimum=True),
                             ]
                         },
                         attributes={
                             "inputs": [input01.to_json(minimum=True)],
                             "outputs": [output01.to_json(minimum=True)]
                         })

    # Create a batch of entities to be uploaded as json/dicts
    batch = [
        step01.to_json(),
        step02.to_json(),
        step03.to_json(),
        parent.to_json(),