def test_peek():
    gt = GuidTracker(-100, "decrease")

    peek_results = gt.peek_next_guid()
    results = gt.get_guid()

    expected = -101

    assert(expected == results)
    assert(results == peek_results)

def test_guid_tracker_get_and_decrement():
    gt = GuidTracker(-100, "decrease")

    results = gt.get_guid()
    expected = -101
    assert(expected == results)

    second_expected = -102
    second_results = gt.get_guid()
    assert(second_expected == second_results)

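# The two tests above pin down GuidTracker's counter behaviour: get_guid() returns the
# next temporary (negative) id and advances the counter, while peek_next_guid() shows
# that value without advancing it. A minimal usage sketch follows; the entity name and
# qualified name are illustrative assumptions, not taken from the tests.
from pyapacheatlas.core import AtlasEntity
from pyapacheatlas.core.util import GuidTracker

gt = GuidTracker()
table = AtlasEntity(
    name="example_table",                           # hypothetical entity
    typeName="DataSet",
    qualified_name="pyapacheatlas://example_table", # hypothetical qualified name
    guid=gt.get_guid()                              # unique negative placeholder guid
)
next_guid = gt.peek_next_guid()                     # look ahead without consuming it
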
def test_batches_entities_with_real_guid():
    gt = GuidTracker()
    a = AtlasEntity("A", "DataSet", "A", guid=gt.get_guid())
    b = AtlasEntity("B", "DataSet", "B", guid=gt.get_guid())
    b.addRelationship(table=a)
    c = AtlasEntity("C", "DataSet", "C", guid=gt.get_guid())
    d = AtlasEntity("D", "DataSet", "D", guid=gt.get_guid())
    c.addRelationship(tester={"guid": "abc-123"})

    entities = [x.to_json() for x in [a, b, c, d]]

    results = batch_dependent_entities(entities, batch_size=2)
    assert (len(results) == 2)

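# batch_dependent_entities groups entities so that anything linked through a
# relationship to an in-memory (negative) guid lands in the same batch, keeping each
# upload self-consistent. A hedged sketch of feeding those batches to Purview; the
# client variable is assumed to be an authenticated PurviewClient as in the other
# snippets here, and the qualified names are placeholders.
from pyapacheatlas.core import AtlasEntity
from pyapacheatlas.core.util import GuidTracker, batch_dependent_entities

gt = GuidTracker()
src = AtlasEntity("src", "DataSet", "example://src", guid=gt.get_guid())
dst = AtlasEntity("dst", "DataSet", "example://dst", guid=gt.get_guid())
dst.addRelationship(table=src)

for batch in batch_dependent_entities([e.to_json() for e in [src, dst]], batch_size=100):
    client.upload_entities(batch=batch)  # client: authenticated PurviewClient (assumed)
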
def convert_Spline_to_Purview(splineJson):
    splineJson = json.loads(splineJson)

    # Get notebook info
    notebookInfo = splineJson["extraInfo"]["notebookInfo"]["obj"]
    notebookURL = notebookInfo["notebookURL"].replace("\\", "")
    guid = GuidTracker()

    # Get inputs
    inputs = []
    for read in splineJson["operations"]["reads"]:
        input_path = read["inputSources"][0].replace(
            notebookInfo["mounts"][0],
            "https://adldata.dfs.core.windows.net/data/")
        input = AtlasEntity(name=input_path.split("/")[-1],
                            typeName="azure_datalake_gen2_path",
                            qualified_name=input_path,
                            guid=guid.get_guid())
        inputs.append(input)

    # Get outputs
    write = splineJson["operations"]["write"]
    output_path = write["outputSource"].replace(
        notebookInfo["mounts"][0],
        "https://adldata.dfs.core.windows.net/data/")
    output = AtlasEntity(name=output_path.split("/")[-1],
                         typeName="azure_datalake_gen2_path",
                         qualified_name=output_path,
                         guid=guid.get_guid())

    # Get Process
    process_attributes = {
        "name": notebookInfo["name"],
        "owner": notebookInfo["user"],
        "description": f"Link to spark job notebook: http://{notebookURL}",
        "startTime": notebookInfo["timestamp"],
        "endTime": notebookInfo["timestamp"]
    }
    process = AtlasProcess(name=notebookInfo["name"],
                           typeName="Process",
                           qualified_name=f"adb-{notebookURL[4:20]}",
                           inputs=inputs,
                           outputs=[output],
                           guid=guid.get_guid(),
                           attributes=process_attributes)

    purview_lineage = inputs + [output] + [process]
    return purview_lineage

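# A hedged sketch of wiring convert_Spline_to_Purview into an upload. The spline_event
# payload and the client variable are assumptions (a raw Spline JSON string and an
# authenticated PurviewClient); only the converter itself comes from the function above.
purview_lineage = convert_Spline_to_Purview(spline_event)
client.upload_entities(batch=[e.to_json() for e in purview_lineage])
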
# Add your credentials here or set them as environment variables
tenant_id = ""
client_id = ""
client_secret = ""
purview_account_name = ""

# COMMAND ----------

oauth = ServicePrincipalAuthentication(
    tenant_id=os.environ.get("TENANT_ID", tenant_id),
    client_id=os.environ.get("CLIENT_ID", client_id),
    client_secret=os.environ.get("CLIENT_SECRET", client_secret))
client = PurviewClient(
    account_name=os.environ.get("PURVIEW_NAME", purview_account_name),
    authentication=oauth)
guid = GuidTracker()

# COMMAND ----------

# Set up a few types and relationships
# This is a one time thing but necessary to make the demo work
# It also demonstrates how you can capture different attributes
# for your dataframes, dataframe columns, and jobs.
type_spark_df = EntityTypeDef(name="custom_spark_dataframe",
                              attributeDefs=[AtlasAttributeDef(name="format")],
                              superTypes=["DataSet"],
                              options={"schemaElementAttribute": "columns"})
type_spark_columns = EntityTypeDef(
    name="custom_spark_dataframe_column",
    attributeDefs=[AtlasAttributeDef(name="data_type")],
    superTypes=["DataSet"],

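# Hedged sketch: custom entity types like the ones being defined above are typically
# registered with upload_typedefs before any entities of those types are uploaded;
# force_update=True makes the call safe to re-run when the types already exist. The
# variable names reuse the ones from the snippet above.
typedef_results = client.upload_typedefs(
    entityDefs=[type_spark_df, type_spark_columns],
    force_update=True)
print(typedef_results)
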
args = parser.parse_args()

oauth_old = ServicePrincipalAuthentication(
    tenant_id=config["OldClient"]["TENANT_ID"],
    client_id=config["OldClient"]["CLIENT_ID"],
    client_secret=config["OldClient"]["CLIENT_SECRET"])
old_client = AtlasClient(endpoint_url=config["OldClient"]["ENDPOINT_URL"],
                         authentication=oauth_old)

oauth_new = ServicePrincipalAuthentication(
    tenant_id=config["NewClient"]["TENANT_ID"],
    client_id=config["NewClient"]["CLIENT_ID"],
    client_secret=config["NewClient"]["CLIENT_SECRET"])
new_client = AtlasClient(endpoint_url=config["NewClient"]["ENDPOINT_URL"],
                         authentication=oauth_new)

gt = GuidTracker()

# Export the glossary terms
if not args.skip_download:
    print("Exporting the old glossary terms")
    glossary_terms = old_client.get_glossary(detailed=True)
    glossary_terms_copy = list(glossary_terms["termInfo"].values())
    with open(unchanged_path, 'w') as fp:
        json.dump(glossary_terms_copy, fp)
else:
    print("Loading existing glossary terms from disk")
    with open(unchanged_path, 'r') as fp:
        glossary_terms_copy = json.load(fp)

# Discover the Relationship Guids that will be uploaded later

def test_batches_entities_dependent():
    gt = GuidTracker()
    a = AtlasEntity("A", "DataSet", "A", guid=gt.get_guid())
    b = AtlasEntity("B", "DataSet", "B", guid=gt.get_guid())
    b.addRelationship(table=a)
    c = AtlasEntity("C", "DataSet", "C", guid=gt.get_guid())
    d = AtlasEntity("D", "DataSet", "D", guid=gt.get_guid())
    c.addRelationship(parent=b)
    d.addRelationship(parent=b)
    e = AtlasEntity("E", "DataSet", "E", guid=gt.get_guid())
    e.addRelationship(table=a)

    f = AtlasEntity("F", "DataSet", "F", guid=gt.get_guid())
    g = AtlasEntity("G", "DataSet", "G", guid=gt.get_guid())
    g.addRelationship(table=f)
    h = AtlasEntity("H", "DataSet", "H", guid=gt.get_guid())
    h.addRelationship(parent=g)

    # Intentionally out of order
    j = AtlasEntity("J", "DataSet", "J", guid=gt.get_guid())
    k = AtlasEntity("K", "DataSet", "K", guid=gt.get_guid())
    i = AtlasEntity("I", "DataSet", "I", guid=gt.get_guid())
    i.addRelationship(colA=j)
    i.addRelationship(colB=k)

    l = AtlasEntity("L", "DataSet", "L", guid=gt.get_guid())
    m = AtlasEntity("M", "DataSet", "M", guid=gt.get_guid())
    n = AtlasEntity("N", "DataSet", "N", guid=gt.get_guid())
    o = AtlasEntity("O", "DataSet", "O", guid=gt.get_guid())
    p = AtlasEntity("P", "DataSet", "P", guid=gt.get_guid())

    entities = [
        x.to_json()
        for x in [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p]
    ]
    results = batch_dependent_entities(entities, batch_size=7)
    # There are sixteen entities; a batch size of 7 means at least three groups
    # One group has seven connected
    # One group should have only three
    # All others are independent
    assert (len(results) == 3)

from pyapacheatlas.auth import ServicePrincipalAuthentication
from pyapacheatlas.core import PurviewClient, AtlasEntity, AtlasProcess, TypeCategory
from pyapacheatlas.core.util import GuidTracker
from pyapacheatlas.core.typedef import AtlasAttributeDef, EntityTypeDef, RelationshipTypeDef
from pyapacheatlas.readers import ExcelConfiguration, ExcelReader

# The above cell gets the v_tenant_id, v_client_id, etc.
auth = ServicePrincipalAuthentication(tenant_id=v_tenant_id,
                                      client_id=v_client_id,
                                      client_secret=v_client_secret)

# Create a client to connect to your service.
client = PurviewClient(account_name=v_data_catalog_name, authentication=auth)
guid = GuidTracker()

# COMMAND ----------

# Search for the entity you want to delete
import json
import os

search = client.search_entities("loan_risk_data.csv")
for page in search:
    print(json.dumps(page, indent=2))

# COMMAND ----------

# MAGIC %md
# MAGIC ##### 3. Bulk delete up to 50 entities

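# Hedged sketch of the bulk delete step the markdown cell above introduces. The guid
# values here are placeholders collected from the search output, and delete_entity is
# called one guid at a time; the original notebook's exact code is not shown above.
guids_to_delete = ["replace-with-guid-1", "replace-with-guid-2"]
for g in guids_to_delete:
    client.delete_entity(guid=g)
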
"type": column_entity_def.name, "name": "table", "isContainer": False, "cardinality": "SINGLE", "isLegacyAttribute": False }) # Upload the results upload_results = client.upload_typedefs( entityDefs=[column_entity_def, table_entity_def], relationshipDefs=[table_column_relationship], force_update=True) # With all the types and relationships defined, we can create entities. # We can use a GuidTracker to always get a unique negative number gt = GuidTracker() table_entity = AtlasEntity( name="sample_table", qualified_name="pyapacheatlas://sample_tablepyapacheatlas_custom_type", typeName="pyapacheatlas_demo_table", guid=gt.get_guid()) # Add two columns. They must include the "relationshipAttribute" attribute. column01 = AtlasEntity( name="column01", typeName="pyapacheatlas_demo_column", qualified_name= "pyapacheatlas://sample_tablepyapacheatlas_custom_type@column01", attributes={ "data_type": "string",
args = parser.parse_args()

oauth_old = ServicePrincipalAuthentication(
    tenant_id=config["OldClient"]["TENANT_ID"],
    client_id=config["OldClient"]["CLIENT_ID"],
    client_secret=config["OldClient"]["CLIENT_SECRET"])
old_client = AtlasClient(endpoint_url=config["OldClient"]["ENDPOINT_URL"],
                         authentication=oauth_old)

oauth_new = ServicePrincipalAuthentication(
    tenant_id=config["NewClient"]["TENANT_ID"],
    client_id=config["NewClient"]["CLIENT_ID"],
    client_secret=config["NewClient"]["CLIENT_SECRET"])
new_client = AtlasClient(endpoint_url=config["NewClient"]["ENDPOINT_URL"],
                         authentication=oauth_new)

gt = GuidTracker(starting=-50000)

# Get all the types you want to "export"
list_of_types_to_consider = [
    "demo_column", "demo_table", "demo_table_columns", "demo_process",
    "demo_column_lineage"
]
indicators = ["guid"]

# Export the list of types to consider
if not args.skip_download:
    print("Searching through entities")
    export_records(old_client, folder_path, list_of_types_to_consider)

# Discover the Relationship Guids that will be uploaded later
print("Discovering guids to remap from disk")

    counter = counter + 1

if len(buffer) > 0:
    with open(os.path.join(relationships_path, "batch-last.json"), 'w') as fp:
        json.dump(buffer, fp)

# Now we can load the remapping files and remap the guids
print('Remapping guids')
remapped_relationships = remap_guids(old_to_new_guids, relationships_path,
                                     output_path)

# Clean up the remapped relationships and upload them one by one...
# This will take a while...
gt = GuidTracker()
counter = 0
skipped = 0
total_relationships = len(remapped_relationships)
for relationship in remapped_relationships:
    inner_relationship = relationship["relationship"]
    inner_relationship["guid"] = str(gt.get_guid())
    # Pop attributes that break the upload
    inner_relationship.pop("updateTime")
    inner_relationship.pop("lastModifiedTS")
    inner_relationship.pop("updatedBy")
    inner_relationship.pop("createTime")
    inner_relationship.pop("createdBy")
    counter = counter + 1
    try:
        results = new_client.upload_relationship(inner_relationship)
