Code example #1
def processItem(anItem, itemCount):
    ''' 
    put your code here - that does something with the item
    '''
    itemId = anItem["id"]
    itemName = edcutils.getFactValue(anItem, "core.name")
    print("\titem " + str(itemCount) + "=" + itemId + " name=" + itemName)
Code example #2
def processItem(anItem, itemCount):
    '''
    put your code here - that does something with the item
    for this example, just print the id and name
    @note python 2.7 does not allow us to specify the parameter type...
    '''
    itemId = anItem["id"]
    itemName = edcutils.getFactValue(anItem, "core.name")
    print("\titem " + str(itemCount) + "=" + itemId + " name=" + itemName)
Code example #3
def getSchemaContents(schemaName, schemaType, resourceName):
    """
    given a schema name, schema class type (e.g. hanadb is different)
    and resource name, find the schema object
    then
        execute a relationships call to get the schema tables & columns
        (parent/child links)
        note:  some models separate primary key columns from regular columns
        note:  some models have different relationships (e.g. sap hana db)

    returns a dictionary of all tables & columns for the schema & the id of
    the schema object
    key=table  val=tableid
    key=table.column  val=columnid
    """
    print("\tgetSchemaContents for:" + schemaName + " resource=" +
          resourceName)
    # schemaDict returned  key=TABLE.COLUMN value=column id
    schemaDict = {}
    tableNames = {}

    # url = catalogServer + "/access/2/catalog/data/objects"
    url = edcHelper.baseUrl + "/access/2/catalog/data/objects"
    query = (f'+core.resourceName:"{resourceName}"' +
             f' +core.classType:"{schemaType}"' +
             f' +core.name:"{schemaName}"')
    parameters = {"q": query, "offset": 0, "pageSize": 1}
    print("\tquery=" + query)

    schemaId = None
    tableCount = 0
    columnCount = 0
    # make the call to find the schema object
    response = edcHelper.session.get(url, params=parameters, timeout=3)
    print(f"session get finished: {response.status_code}")
    rc = response.status_code

    if rc != 200:
        print("error reading object: rc=" + str(rc) + " response:" +
              str(response.json))
        if rc == 401:
            print("\t401:Possible Missing/bad credentials")
            print(str(response))
        return

    # get the total # of objects returned (first part of the json resultset)
    totalObjects = response.json()["metadata"]["totalCount"]
    print("\tobjects returned: " + str(totalObjects))

    for item in response.json()["items"]:
        schemaId = item["id"]
        schemaName = edcutils.getFactValue(item, "core.name")
        # get the tables & columns
        print("\tfound schema: " + schemaName + " id=" + schemaId)

        lineageURL = edcHelper.baseUrl + "/access/2/catalog/data/relationships"
        lineageParms = {
            "seed": schemaId,
            "association": "core.ParentChild",
            "depth": "2",
            "direction": "OUT",
            "includeAttribute": {"core.name", "core.classType"},
            "includeTerms": "false",
            "removeDuplicateAggregateLinks": "false",
        }
        print("\tGET child rels for schema: " + lineageURL + " parms=" +
              str(lineageParms))
        # get using uid/pwd
        lineageResp = edcHelper.session.get(
            lineageURL,
            params=lineageParms,
        )
        lineageStatus = lineageResp.status_code
        print("\tlineage resp=" + str(lineageStatus))
        if lineageStatus != 200:
            print(f"error getting schema contents (tables) rc={lineageStatus}"
                  f" response:{lineageResp.json()}")
            if lineageStatus == 401:
                print("\t401:Possible Missing/bad credentials")
                print(str(lineageResp))
            return

        if lineageResp.text.startswith("{items:"):
            # bug (10.2.0 & 10.2.1) - the items collection should be "items"
            lineageJson = lineageResp.text.replace("items", '"items"', 1)
        else:
            lineageJson = lineageResp.text
        # relsJson = json.loads(lineageJson.replace('items', '"items"'))
        relsJson = json.loads(lineageJson)
        # print(len(relsJson))

        for lineageItem in relsJson["items"]:
            # print('\t\t' + str(lineageItem))
            inId = lineageItem.get("inId")
            outId = lineageItem.get("outId")

            # print('new inId===' + inId + " outId=" + outId)
            # print(edcutils.getFactValue(lineageItem["inEmbedded"], "core.name"))
            assocId = lineageItem.get("associationId")
            # print("\t\t" + inId + " assoc=" + assocId)
            # if assocId=='com.infa.ldm.relational.SchemaTable':
            if assocId.endswith(".SchemaTable"):
                # note - custom lineage does not need table and column
                # count the tables & store table names
                tableCount += 1
                # tableName = inId.split('/')[-1]
                tableName = edcutils.getFactValue(lineageItem["inEmbedded"],
                                                  "core.name").lower()
                # store the table name (for lookup when processing the columns)
                # key=id, val=name
                tableNames[inId] = tableName
                schemaDict[tableName] = inId
            # if assocId=='com.infa.ldm.relational.TableColumn':
            if assocId.endswith(".TableColumn") or assocId.endswith(
                    ".TablePrimaryKeyColumn"):
                # columnName = inId.split('/')[-1]
                columnCount += 1
                columnName = edcutils.getFactValue(lineageItem["inEmbedded"],
                                                   "core.name").lower()
                tableName = tableNames[outId].lower()
                # print("column=" + tableName + "." + columnName)
                schemaDict[tableName + "." + columnName] = inId

    print("\tgetSchema: returning " + str(columnCount) + " columns, in " +
          str(tableCount) + " tables")
    return schemaDict, schemaId
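A hedged usage sketch for getSchemaContents - the schema, class type, and resource names below are placeholders, not values from the original script:

# hypothetical call with placeholder names; unpacks the (schemaDict, schemaId) result
schemaDict, schemaId = getSchemaContents(
    "LANDING", "com.infa.ldm.relational.Schema", "my_oracle_resource"
)
for key, objectId in schemaDict.items():
    if "." in key:
        print("column " + key + " -> " + objectId)
    else:
        print("table  " + key + " -> " + objectId)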
Code example #4
def main():
    """
    main starts here - run the query processing all items

    - find all com.infa.ldm.relational.ExternalDatabase objects
      - for each - call processExternalDB
    """
    print("ExternalDBLinker started")
    start_time = time.time()

    tableLinksCreated = 0
    columnLinksCreated = 0
    errorsFound = 0
    """
    logger = logging.getLogger('externalDBLinker')
    logger.setLevel('DEBUG')

    file_log_handler = logging.FileHandler('logfile.log')
    logger.addHandler(file_log_handler)

    stderr_log_handler = logging.StreamHandler()
    logger.addHandler(stderr_log_handler)

    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_log_handler.setFormatter(formatter)
    stderr_log_handler.setFormatter(formatter)

    logger.info('Info message')
    logger.error('Error message')
    """

    columnHeader = [
        "Association",
        "From Connection",
        "To Connection",
        "From Object",
        "To Object",
    ]
    if str(platform.python_version()).startswith("2.7"):
        fCSVFile = open(outputFile, "w")
    else:
        fCSVFile = open(outputFile, "w", newline="", encoding="utf-8")
    print("custom lineage file initialized. " + outputFile)
    colWriter = csv.writer(fCSVFile)
    colWriter.writerow(columnHeader)

    print("executing query to find all external DB objects: " + url + " q=" +
          parameters.get("q"))
    resp = requests.get(url,
                        params=parameters,
                        headers=header,
                        auth=HTTPBasicAuth(uid, pwd))
    status = resp.status_code
    print("extDB query rc=" + str(status))

    resultJson = resp.json()
    total = resultJson["metadata"]["totalCount"]
    print("external db objects found... " + str(total))
    currentDB = 0

    # for each externalDatabase object
    for extDBItem in resultJson["items"]:
        itemId = extDBItem["id"]
        currentDB += 1
        print("processing database: " + itemId + " " + str(currentDB) +
              " of " + str(total))
        itemType = edcutils.getFactValue(extDBItem, "core.classType")
        itemName = edcutils.getFactValue(extDBItem, "core.name")
        resourceName = edcutils.getFactValue(extDBItem, "core.resourceName")
        resourceType = edcutils.getFactValue(extDBItem, "core.resourceType")

        tabLinks, colLinks, errors = processExternalDB(itemId, itemType,
                                                       itemName, resourceType,
                                                       resourceName, colWriter)
        tableLinksCreated += tabLinks
        columnLinksCreated += colLinks
        errorsFound += errors

        sys.stdout.flush()

    fCSVFile.close()
    print("finished!")
    print("table links:   created=" + str(tableLinksCreated))
    print("column links:  created=" + str(columnLinksCreated))
    print("errors found: " + str(errorsFound))

    # end of main()
    print("Finished - run time = %s seconds ---" % (time.time() - start_time))

    # call the resource create/update/load function to get the data imported into EDC
    if executeEDCImport:
        edcutils.createOrUpdateAndExecuteResource(
            catalogServer,
            uid,
            pwd,
            lineageResourceName,
            lineageResourceTemplate,
            csvFileName,
            outputFile,
            waitToComplete,
        )
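Code examples #4, #5, and #8 reference module-level settings (catalogServer, uid, pwd, url, parameters, header, outputFile/csvFileName, executeEDCImport, and the lineage resource names) that are defined elsewhere in the script. A rough sketch of what that setup might look like, with placeholder values only:

# hypothetical module-level configuration assumed by main() above - all values are placeholders
import csv
import json
import platform
import sys
import time

import requests
from requests.auth import HTTPBasicAuth

import edcutils  # author's helper module used throughout these examples

catalogServer = "http://localhost:9085"      # EDC catalog service
uid = "Administrator"                        # catalog user (prefer env vars/secrets in practice)
pwd = "changeme"
header = {"Accept": "application/json"}
url = catalogServer + "/access/2/catalog/data/objects"
parameters = {
    "q": "core.classType:com.infa.ldm.relational.ExternalDatabase",
    "offset": 0,
    "pageSize": 1000,
}
csvFileName = "externalDBLinks.csv"
outputFile = csvFileName
executeEDCImport = False                     # set True to create/run the custom lineage resource
lineageResourceName = "externalDBlinker_lineage"
lineageResourceTemplate = "template/custom_lineage_template.xml"
waitToComplete = False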
Code example #5
def processExternalDB(dbId, classType, dbName, resType, resName, colWriter):
    """
    dbId = the id of the external database object
    classType = the classtype of the external database object
    dbName - the name of the external db object
    resType - the resource type e.g. Oracle, SQLServer that created the
              external db object
    resName - the resource name that created the external db object
    colWriter - the csv File object to write any lineage entries

    process outline:-
    get lineage to the table level (2 parent/child hops) from the external database
        for each table, find the actual table that should be linked (case insensitive)
        if only 1 table was found (otherwise - print an error message)
            link the table and all columns
    """

    dbUnknown = False
    # counters for # of links created
    tabLinks = 0
    colLinks = 0
    errors = 0

    # 'External' can be used if the database name is not known (sqlserver use case)
    if dbName == "External":
        dbUnknown = True
        print("\tthe database for the externally referenced object is unknown")

    # note:  if the database type is oracle, we can't trust the name of the database
    #        since it will contain the dblink name, not the externally referenced db
    #        so we will treat it as unknown and just try to find the schema/table ref'd
    if resType == "Oracle":
        dbUnknown = True

    print("\ttype=" + classType + " name=" + dbName + " id=" + dbId +
          " unknownDB:" + str(dbUnknown))

    # get the lineage for the database object
    lineageURL = catalogServer + "/access/2/catalog/data/relationships"
    lineageParms = {
        "seed": dbId,
        "association": "core.ParentChild",
        "depth": "2",
        "direction": "OUT",
        "includeAttribute": {"core.name", "core.classType"},
        "includeTerms": "false",
        "removeDuplicateAggregateLinks": "false",
    }
    print("\tLineage query for: " + dbName + "  params=" + str(lineageParms))
    lineageResp = requests.get(lineageURL,
                               params=lineageParms,
                               headers=header,
                               auth=HTTPBasicAuth(uid, pwd))

    lineageStatus = lineageResp.status_code
    print("\tlineage rc=" + str(lineageStatus))
    lineageJson = lineageResp.text

    #  bug in the relationships api call - the items collection should be
    #  "items" (with quotes)
    if lineageJson.startswith("{items"):
        # bug in 10.2.1 and before
        # replace items with "items" - for just the first occurrence
        # (note:  this is fixed after 10.2.1)
        lineageJson = json.loads(lineageJson.replace("items", '"items"', 1))
    else:
        # 10.2.1u1 + json is escaped properly
        lineageJson = json.loads(lineageJson)

    # for each item in the lineage resultset
    for lineageItem in lineageJson["items"]:
        inId = lineageItem.get("inId")
        assocId = lineageItem.get("associationId")
        schemaName = ""
        # print("\t" + inId + " assoc=" + assocId)
        if assocId == "com.infa.ldm.relational.ExternalSchemaTable":
            # find the table...
            schemaName = inId.split("/")[-2]
            inEmbedded = lineageItem.get("inEmbedded")
            tableName = edcutils.getFactValue(inEmbedded, "core.name")
            print("\tprocessing table=" + tableName + " schema=" + schemaName +
                  " db=" + dbName + " id=" + inId)

            # format the query to find the actual table
            q = (
                "core.classType:(com.infa.ldm.relational.Table or com.infa.ldm.relational.View)  and core.name_lc_exact:"
                + tableName)
            if dbUnknown and schemaName == "":
                q = q + " and core.resourceName:" + resName
            # if dbUnknown==False:
            #    q=q+ ' and ' + dbName
            q = q + ' and core.resourceType:"' + resType + '"'
            tableSearchParms = {"q": q, "offset": 0, "pageSize": 100}
            print("\t\tquery=" + str(tableSearchParms))
            # find the table - with name tableName
            tResp = requests.get(
                url,
                params=tableSearchParms,
                headers=header,
                auth=HTTPBasicAuth(uid, pwd),
            )
            tStatus = tResp.status_code
            print("\t\tquery rc=" + str(tStatus))
            # print("\t\t\t\t" + str(tResp.json()))
            foundTabId = ""
            tableMatchCount = 0
            # possible matching tables
            for tableItem in tResp.json()["items"]:
                fromTableId = tableItem["id"]
                fromSchemaName = fromTableId.split("/")[-2].lower()
                # foundTabId=''
                # foundTabId = tableItem["id"]
                # print("rs-" + fromTableId + " compared with " + inId);
                if fromTableId != inId and fromTableId.count("/") == 4:
                    # filter out comparing against itself +
                    # counting the / chars - 4 = normal db, 5=external db (disregard)
                    print("\t\tchecking " + fromTableId)
                    # print("could be this one...." + foundTabId + " << " + inId)
                    theName = fromTableId.split("/")[-1]

                    # the schema could be empty - in that case it should match dbo
                    # unless we have a way of knowing what the default schema is
                    if theName.lower() == tableName.lower() and (
                            schemaName.lower() == fromSchemaName.lower() or
                        (schemaName == "" and fromSchemaName == "dbo")):
                        tableMatchCount += 1
                        foundTabId = fromTableId
                        print("\t\ttable name matches..." + theName + "==" +
                              tableName + " " + inId + " " + foundTabId +
                              " count/" + str(fromTableId.count("/")))
                    else:
                        print(
                            "\t\tno match...schema match 1:" + theName +
                            " 2:" + tableName + " 3:" + schemaName + " 4:" +
                            fromSchemaName + ":" +
                            str(schemaName.lower() == fromSchemaName.lower()))
                # else:
                # print("skipping this one");

            print("\t\ttotal matching tables=" + str(tableMatchCount) +
                  " inId:" + inId)
            if tableMatchCount == 1:
                # ok - we have a single table to match
                # link at the table level - then get the columns and link them too
                # print("linking from actual tab - to external ref tab")
                print("\t\tlinking tables " + foundTabId + " -->> " + inId)
                # get the columns for the ext Table
                # (will be a reduced set - only the linked columns)
                extCols = getColumnsForTable(inId)
                tabCols = getColumnsForTable(foundTabId)

                # link the table level
                edcutils.exportLineageLink(foundTabId, inId,
                                           "core.DataSetDataFlow", colWriter)
                tabLinks += 1

                tabColsLinked = 0

                # match the columns on the left/right side
                for toCol, toId in extCols.items():
                    # check if the toCol (the name) exists in the tabCols dict
                    # print('\t\t\tchecking toCol:' + toCol + tabCols.get(toCol))
                    fromId = tabCols.get(toCol)
                    if fromId is not None:
                        # print('\t\t\tlinking columns...' + fromId + ' --->>>' + toId)
                        edcutils.exportLineageLink(fromId, toId,
                                                   "core.DirectionalDataFlow",
                                                   colWriter)
                        tabColsLinked += 1
                        colLinks += 1
                    else:
                        print("\t\t\tError: cannot find column " + toCol +
                              " in table " + inId)
                        errors += 1
                print("\t\t\text cols:" + str(len(extCols)) + " tablCols:" +
                      str(len(tabCols)) + " linked=" + str(tabColsLinked))
                # print("\t\t\tcolumns linked=" + str(tabColsLinked))
            else:
                print("\t\tmutlple possible matches found (" +
                      str(tableMatchCount) + ") no links will be created")
            # flush the console buffer - for tailing the stdout log
            sys.stdout.flush()

    print("external database: " + dbName +
          " processed: tab/col links created: " + str(tabLinks) + "/" +
          str(colLinks) + " errors:" + str(errors))
    print("")
    return tabLinks, colLinks, errors
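processExternalDB relies on a getColumnsForTable helper that is not included in this excerpt. Judging from how extCols and tabCols are used above (name-keyed lookups of column ids), a sketch of such a helper might look like the following - an illustration under those assumptions, not the original implementation:

# hypothetical sketch of getColumnsForTable (the real helper is not shown in this excerpt)
# returns {column_name_lowercased: column_id} for the direct children of a table
# note: omits the pre-10.2.1u1 '{items:' quoting workaround shown above
def getColumnsForTable(tableId):
    cols = {}
    parms = {
        "seed": tableId,
        "association": "core.ParentChild",
        "depth": "1",
        "direction": "OUT",
        "includeAttribute": {"core.name", "core.classType"},
        "includeTerms": "false",
        "removeDuplicateAggregateLinks": "false",
    }
    resp = requests.get(
        catalogServer + "/access/2/catalog/data/relationships",
        params=parms,
        headers=header,
        auth=HTTPBasicAuth(uid, pwd),
    )
    if resp.status_code != 200:
        return cols
    for item in resp.json().get("items", []):
        # keep only table->column style associations (e.g. TableColumn, ViewColumn)
        if item.get("associationId", "").endswith("Column"):
            name = edcutils.getFactValue(item["inEmbedded"], "core.name")
            cols[name.lower()] = item["inId"]
    return cols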
Code example #6
def getSchemaContents(schemaName, schemaType, resourceName):
    """
    given a schema name, schema class type (e.g. hanadb is different) and resource name
    find the schema object
    then
        execute a relationships call to get the schema tables & columns (parent/child links)
        note:  some models separate primary key columns from regular columns
        note:  some models have different relationships (e.g. sap hana db)
        
    returns a dictionary of all tables & columns for the schema & the id of the schema object
    key=table  val=tableid
    key=table.column  val=columnid
    """
    print('\tgetSchemaContents for:' + schemaName + " resource=" +
          resourceName)
    # schemaDict returned  key=TABLE.COLUMN value=column id
    schemaDict = {}
    tableNames = {}

    url = catalogServer + '/access/2/catalog/data/objects'
    query = "core.resourceName:" + resourceName + " and core.classType:" + schemaType + " and core.name_lc_exact:" + schemaName
    parameters = {'q': query, 'offset': 0, 'pageSize': 1}
    # header using uid/pwd (no Authorization)
    header = {"Accept": "application/json"}
    # header using Authorization - no need to use uid/pwd in the get call
    #header = {"Accept": "application/json", "Authorization": authCredentials}

    print("\tquery=" + query)
    #print("\theader=" + str(header))

    schemaId = None
    tableCount = 0
    columnCount = 0
    # make the call to find the schema object
    response = requests.get(url,
                            params=parameters,
                            headers=header,
                            auth=HTTPBasicAuth(uid, pwd))
    #response = requests.get(url,params=parameters,headers=header)
    rc = response.status_code
    if rc != 200:
        print("error reading object: rc=" + str(rc) + " response:" +
              str(response.json))
        if rc == 401:
            print(
                "\t401:Possible Missing/bad credentials - or server not found/responding"
            )
            print(str(response))
        return

    # get the total # of objects returned (first part of the json resultset)
    totalObjects = response.json()['metadata']['totalCount']
    print("\tobjects returned: " + str(totalObjects))

    for item in response.json()['items']:
        schemaId = item["id"]
        schemaName = edcutils.getFactValue(item, 'core.name')
        # get the tables & columns
        print("\tfound schema: " + schemaName + " id=" + schemaId)

        lineageURL = catalogServer + '/access/2/catalog/data/relationships'
        lineageParms = {
            "seed": schemaId,
            "association": "core.ParentChild",
            "depth": "2",
            "direction": "OUT",
            "includeAttribute": {'core.name', 'core.classType'},
            "includeTerms": "false",
            "removeDuplicateAggregateLinks": "false"
        }
        print("\tGET child rels for schema: " + lineageURL + " parms=" +
              str(lineageParms))
        # get using uid/pwd
        lineageResp = requests.get(lineageURL,
                                   params=lineageParms,
                                   headers=header,
                                   auth=HTTPBasicAuth(uid, pwd))
        # credentials are in the header
        #lineageResp = requests.get(lineageURL,params=lineageParms,headers=header)
        lineageStatus = lineageResp.status_code
        print("\tlineage resp=" + str(lineageStatus))
        if lineageStatus != 200:
            print("error getting schema contents (tables) rc=" +
                  str(lineageStatus) + " response:" + str(lineageResp.json()))
            if lineageStatus == 401:
                print(
                    "\t401:Possible Missing/bad credentials - or server not found/responding"
                )
                print(str(lineageResp))
            return

        if lineageResp.text.startswith('{items:'):
            # bug (10.2.0 & 10.2.1) - the items collection should be "items"
            lineageJson = lineageResp.text.replace('items', '"items"', 1)
        else:
            lineageJson = lineageResp.text
        #relsJson = json.loads(lineageJson.replace('items', '"items"'))
        relsJson = json.loads(lineageJson)
        #print(len(relsJson))

        for lineageItem in relsJson["items"]:
            #print('\t\t' + str(lineageItem))
            inId = lineageItem.get("inId")
            outId = lineageItem.get("outId")

            #print('new inId===' + inId + " outId=" + outId)
            #print(edcutils.getFactValue(lineageItem["inEmbedded"], "core.name"))
            assocId = lineageItem.get("associationId")
            #print("\t\t" + inId + " assoc=" + assocId)
            #if assocId=='com.infa.ldm.relational.SchemaTable':
            if assocId.endswith('.SchemaTable'):
                # note - custom lineage does not need table and column - count the tables & store table names
                tableCount += 1
                #tableName = inId.split('/')[-1]
                tableName = edcutils.getFactValue(lineageItem["inEmbedded"],
                                                  "core.name").lower()
                # store the table name (for lookup when processing the columns) key=id, val=name
                tableNames[inId] = tableName
                schemaDict[tableName] = inId
            #if assocId=='com.infa.ldm.relational.TableColumn':
            if assocId.endswith('.TableColumn') or assocId.endswith(
                    ".TablePrimaryKeyColumn"):
                #columnName = inId.split('/')[-1]
                columnCount += 1
                columnName = edcutils.getFactValue(lineageItem["inEmbedded"],
                                                   "core.name").lower()
                tableName = tableNames[outId].lower()
                #print("column=" + tableName + "." + columnName)
                schemaDict[tableName + "." + columnName] = inId

    print("\tgetSchema: returning " + str(columnCount) + " columns, in " +
          str(tableCount) + " tables")
    return schemaDict, schemaId
Code example #7
def main():
    """
    main starts here - run the query processing all items

    - find all com.infa.ldm.relational.ExternalDatabase objects
      - for each - call processExternalDB
    """
    print("ExternalDBLinker started")
    start_time = time.time()

    args, unknown = parser.parse_known_args()
    # setup edc session and catalog url - with auth in the session header,
    # by using system vars or command-line args
    edcHelper.initUrlAndSessionFromEDCSettings()
    print(f"command-line args parsed = {args} ")

    tableLinksCreated = 0
    columnLinksCreated = 0
    errorsFound = 0

    columnHeader = [
        "Association",
        "From Connection",
        "To Connection",
        "From Object",
        "To Object",
    ]
    outputFile = args.outDir + "/" + args.csvFileName
    fullpath = os.path.abspath(outputFile)
    fCSVFile = open(outputFile, "w", newline="", encoding="utf-8")

    print("custom lineage file initialized. " + outputFile + " full path=" + fullpath)
    colWriter = csv.writer(fCSVFile)
    colWriter.writerow(columnHeader)

    parameters = {
        "q": "core.classType:com.infa.ldm.relational.ExternalDatabase",
        "offset": 0,
        "pageSize": 1000,
    }
    url = edcHelper.baseUrl + "/access/2/catalog/data/objects"

    print(
        "executing query to find all external DB objects: "
        f"{url} q={parameters.get('q')} {parameters}"
    )
    resp = edcHelper.session.get(url, params=parameters)
    status = resp.status_code
    print("extDB query rc=" + str(status))

    if status != 200:
        print(f"error - expecting 200 rc, got {status} - message={resp.json()}")
        return

    resultJson = resp.json()
    total = resultJson["metadata"]["totalCount"]
    print(f"external db objects found... {total}")
    currentDB = 0

    # for each externalDatabase object
    for extDBItem in resultJson["items"]:
        itemId = extDBItem["id"]
        currentDB += 1
        print(f"processing database: {itemId} {currentDB} of {total}")
        itemType = edcutils.getFactValue(extDBItem, "core.classType")
        itemName = edcutils.getFactValue(extDBItem, "core.name")
        resourceName = edcutils.getFactValue(extDBItem, "core.resourceName")
        resourceType = edcutils.getFactValue(extDBItem, "core.resourceType")

        tabLinks, colLinks, errors = processExternalDB(
            itemId, itemType, itemName, resourceType, resourceName, colWriter
        )
        tableLinksCreated += tabLinks
        columnLinksCreated += colLinks
        errorsFound += errors

        sys.stdout.flush()

    fCSVFile.close()
    print("finished!")
    print("table links:   created=" + str(tableLinksCreated))
    print("column links:  created=" + str(columnLinksCreated))
    print("errors found: " + str(errorsFound))

    # end of main()
    print("Finished - run time = %s seconds ---" % (time.time() - start_time))

    # call the resource create/update/load function to get the data imported into EDC
    if args.edcimport and tableLinksCreated > 0:
        edcutils.createOrUpdateAndExecuteResourceUsingSession(
            edcHelper.baseUrl,
            edcHelper.session,
            args.lineageResourceName,
            args.lineageResourceTemplate,
            args.csvFileName,
            fullpath,
            waitToComplete,
            "LineageScanner"
        )
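Example #7 pulls its options from a module-level argparse parser (args.outDir, args.csvFileName, args.edcimport, args.lineageResourceName, args.lineageResourceTemplate). A hedged sketch of a parser that would satisfy those references - flag names and defaults here are assumptions, not the original script's definitions:

# hypothetical argparse setup matching the attributes used by main() above
import argparse

parser = argparse.ArgumentParser(description="ExternalDBLinker")
parser.add_argument("-o", "--outDir", default="out",
                    help="folder to write the custom lineage csv file")
parser.add_argument("-f", "--csvFileName", default="externalDBLinks.csv",
                    help="name of the custom lineage csv file")
parser.add_argument("-i", "--edcimport", action="store_true",
                    help="create/execute the custom lineage resource after writing the csv")
parser.add_argument("--lineageResourceName", default="externalDBlinker_lineage")
parser.add_argument("--lineageResourceTemplate", default="template/custom_lineage_template.xml")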
Code example #8
def main():
    '''
    main starts here - run the query processing all items
    
    - find all com.infa.ldm.relational.ExternalDatabase objects
      - for each - call processExternalDB
    '''
    print("ExternalDBLinker started")
    start_time = time.time()

    tableLinksCreated = 0
    columnLinksCreated = 0
    errorsFound = 0

    columnHeader = [
        "Association", "From Connection", "To Connection", "From Object",
        "To Object"
    ]
    if str(platform.python_version()).startswith("2.7"):
        fCSVFile = open(csvFileName, "w")
    else:
        fCSVFile = open(csvFileName, "w", newline='', encoding='utf-8')
    print("custom lineage file initialized. " + csvFileName)
    colWriter = csv.writer(fCSVFile)
    colWriter.writerow(columnHeader)

    print("executing query to find all external DB objects: " + url + ' q=' +
          parameters.get('q'))
    resp = requests.get(url,
                        params=parameters,
                        headers=header,
                        auth=HTTPBasicAuth(uid, pwd))
    status = resp.status_code
    print('extDB query rc=' + str(status))

    resultJson = resp.json()
    total = resultJson['metadata']['totalCount']
    print("external db objects found... " + str(total))
    currentDB = 0

    # for each externalDatabase object
    for extDBItem in resultJson["items"]:
        itemId = extDBItem["id"]
        currentDB += 1
        print("processing database: " + itemId + " " + str(currentDB) +
              ' of ' + str(total))
        itemType = edcutils.getFactValue(extDBItem, "core.classType")
        itemName = edcutils.getFactValue(extDBItem, "core.name")
        resourceName = edcutils.getFactValue(extDBItem, "core.resourceName")
        resourceType = edcutils.getFactValue(extDBItem, "core.resourceType")

        tabLinks, colLinks, errors = processExternalDB(itemId, itemType,
                                                       itemName, resourceType,
                                                       resourceName, colWriter)
        tableLinksCreated += tabLinks
        columnLinksCreated += colLinks
        errorsFound += errors

    fCSVFile.close()
    print("finished!")
    print("table links:   created=" + str(tableLinksCreated))
    print("column links:  created=" + str(columnLinksCreated))
    print("errors found: " + str(errorsFound))

    # end of main()
    print("Finished - run time = %s seconds ---" % (time.time() - start_time))
Code example #9
def getSchemaContents(schemaName, schemaType, resourceName, substitute_chars):
    """
    given a schema name, schema class type (e.g. hanadb is different)
    and resource name, find the schema object
    then
        execute a relationships call to get the schema tables & columns
        (parent/child links)
        note:  some models separate primary key columns from regular columns
        note:  some models have different relationships (e.g. sap hana db)

    returns a dictionary of all tables & columns for the schema & the id of
    the schema object
    key=table  val=tableid
    key=table.column  val=columnid
    """
    print("\tgetSchemaContents for:" + schemaName + " resource=" + resourceName)
    # schemaDict returned  key=TABLE.COLUMN value=column id
    schemaDict = {}
    tableNames = {}

    # url = catalogServer + "/access/2/catalog/data/objects"
    url = edcHelper.baseUrl + "/access/2/catalog/data/objects"
    query = (f'+core.resourceName:"{resourceName}"'
        + f' +core.classType:"{schemaType}"'
        + f' +core.name:"{schemaName}"'
    )
    parameters = {"q": query, "offset": 0, "pageSize": 1}
    print("\tquery=" + query)

    schemaId = None
    tableCount = 0
    columnCount = 0
    # make the call to find the schema object
    response = edcHelper.session.get(url, params=parameters, timeout=3)
    print(f"session get finished: {response.status_code}")
    rc = response.status_code

    if rc != 200:
        print("error reading object: rc=" + str(rc) + " response:" + str(response.json))
        if rc == 401:
            print(
                "\t401:Possible Missing/bad credentials"
            )
            print(str(response))
        return

    # get the total # of objects returned (first part of the json resultset)
    totalObjects = response.json()["metadata"]["totalCount"]
    print("\tobjects returned: " + str(totalObjects))

    for item in response.json()["items"]:
        schemaId = item["id"]
        schemaNameFound = edcutils.getFactValue(item, "core.name")
        # get the tables & columns

        # check to see if schemaName found is an exact match
        # (for situations where there are other schemas with the same prefix)
        # e.g. search for "PUBLIC" will also return "PUBLIC_TEST"
        print("\tfound schema: " + schemaNameFound + " id=" + schemaId + " type=" + schemaType)
        if schemaNameFound != schemaName:
            print(f"schema {schemaNameFound} does not exactly match {schemaName}, skipping")
            continue

        lineageURL = edcHelper.baseUrl + "/access/2/catalog/data/relationships"
        depth = 2
        # for SAP Hana Calculation views - the package might contain sub-packages
        # set the depth to >2
        if schemaType == "com.infa.ldm.relational.SAPHanaPackage":
            depth = 10
            print(f"\tNote:  SAP Hana Package used for datasource (schema)\n\t\tsetting depth={depth} for relationships query")
        lineageParms = {
            "seed": schemaId,
            "association": "core.ParentChild",
            "depth": depth,
            "direction": "OUT",
            "includeAttribute": {"core.name", "core.classType"},
            "includeTerms": "false",
            "removeDuplicateAggregateLinks": "false",
        }

        print(
            "\tGET child rels for schema: " + lineageURL + " parms=" + str(lineageParms)
        )
        # get using uid/pwd
        lineageResp = edcHelper.session.get(
            lineageURL,
            params=lineageParms,
        )
        lineageStatus = lineageResp.status_code
        print("\tlineage resp=" + str(lineageStatus))
        if lineageStatus != 200:
            print(
                f"error getting schema contents (tables) rc={lineageStatus}"
                f" response:{lineageResp.json()}"
            )
            if lineageStatus == 401:
                print(
                    "\t401:Possible Missing/bad credentials"
                )
                print(str(lineageResp))
            return

        if lineageResp.text.startswith("{items:"):
            # bug (10.2.0 & 10.2.1) - the items collection should be "items"
            lineageJson = lineageResp.text.replace("items", '"items"', 1)
        else:
            lineageJson = lineageResp.text
        # relsJson = json.loads(lineageJson.replace('items', '"items"'))
        relsJson = json.loads(lineageJson)
        # print(len(relsJson))

        for lineageItem in relsJson["items"]:
            # print('\t\t' + str(lineageItem))
            inId = lineageItem.get("inId")
            outId = lineageItem.get("outId")

            # print('new inId===' + inId + " outId=" + outId)
            # print(edcutils.getFactValue(lineageItem["inEmbedded"], "core.name"))
            assocId = lineageItem.get("associationId")
            # print("\t\t" + inId + " assoc=" + assocId)
            # if assocId=='com.infa.ldm.relational.SchemaTable':
            if (assocId.endswith(".SchemaTable")
               or assocId == "com.infa.adapter.snowflake.PackageFlatRecord_table"
               or assocId == "com.infa.ldm.relational.SAPHanaPackageCalculationView"):
                # note - custom lineage does not need table and column
                # count the tables & store table names
                tableCount += 1
                # tableName = inId.split('/')[-1]
                # tableName = edcutils.getFactValue(
                #     lineageItem["inEmbedded"], "core.name"
                # ).lower()
                # store the table name (for lookup when processing the columns)
                # key=id, val=name
                # tableNames[inId] = tableName
                # schemaDict[tableName] = inId
            # if assocId=='com.infa.ldm.relational.TableColumn':
            if (assocId.endswith(".TableColumn")
               or assocId.endswith(".TablePrimaryKeyColumn")
               or assocId == "com.infa.adapter.snowflake.FlatRecord_tableField"
               or assocId == "com.infa.ldm.relational.CalculationViewAttribute"):
                # columnName = inId.split('/')[-1]
                columnCount += 1
                columnName = edcutils.getFactValue(
                    lineageItem["inEmbedded"], "core.name"
                ).lower()
                # check if key exists??  possible bug or different order from relationships

                # check for substitutions
                columnName = (substitute_name(columnName, substitute_chars)).lower()
                # get table name from id (split and get the parent - won't work if a table has "/" in its name)
                tableName = outId.split("/")[-1].lower()
                # print(f"table::: {outId}")
                # print(outId.split("/"))
                # tableName = tableNames[outId].lower()
                # print("column=" + tableName + "." + columnName)
                schemaDict[tableName + "." + columnName] = inId
                schemaDict[tableName] = outId

    print(
        "\tgetSchema: returning "
        + str(columnCount)
        + " columns, in "
        + str(tableCount)
        + " tables"
    )
    return schemaDict, schemaId
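Example #9 calls substitute_name(columnName, substitute_chars), a helper not included in this excerpt. Judging from its use (normalizing column names before they become dictionary keys), it presumably applies a set of text substitutions; a minimal sketch under that assumption:

# hypothetical sketch of substitute_name (the real helper is not shown here)
# substitute_chars is assumed to be a mapping of {old_text: new_text} applied in order
def substitute_name(name, substitute_chars):
    for old, new in (substitute_chars or {}).items():
        name = name.replace(old, new)
    return name

# example: substitute_name("order id", {" ": "_"}) -> "order_id"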
Code example #10
def main():
    """
    main starts here - run the query processing all items

    """
    print("ExternalDBLinker started")
    start_time = time.time()

    args, unknown = parser.parse_known_args()
    # setup edc session and catalog url - with auth in the session header,
    # by using system vars or command-line args
    edcHelper.initUrlAndSessionFromEDCSettings()
    print(f"command-line args parsed = {args} ")

    tableLinksCreated = 0
    columnLinksCreated = 0
    errorsFound = 0

    # note: the output csv file is initialized further below (after the lineage
    # call), using the left/right column header for view column lineage

    query = f"+core.classType:com.infa.ldm.relational.View +core.name:\"{args.viewname}\""

    parameters = {
        "q": query,
        "offset": 0,
        "pageSize": 1000,
    }
    url = edcHelper.baseUrl + "/access/2/catalog/data/objects"

    print(
        f"executing query to find view named {args.viewname}: "
        f"{url} q={parameters.get('q')} {parameters}"
    )
    resp = edcHelper.session.get(url, params=parameters)
    status = resp.status_code
    print("extDB query rc=" + str(status))

    if status != 200:
        print(f"error - expecting 200 rc, got {status} - message={resp.json()}")
        return

    resultJson = resp.json()
    total = resultJson["metadata"]["totalCount"]
    print(f"external db objects found... {total}")

    id_list = list()

    # for each view object found
    for view in resultJson["items"]:
        itemId = view["id"]
        # print(f"\tview={itemId}")
        for dstlink in view['dstLinks']:
            print(f"\t{dstlink}")
            print(f"{dstlink.get('id')}")
            if dstlink['classType'] == 'com.infa.ldm.relational.ViewColumn':
                id_list.append(dstlink['id'])

    print(f"view columns found - used for lineage seed ids...{len(id_list)}")

    # get the lineage for the database object
    lineageURL = edcHelper.baseUrl + "/access/2/catalog/data/relationships"
    lineageParms = {
        "seed": id_list,
        "association": "core.DirectionalDataFlow",
        "depth": "1",
        "direction": "IN",
        "includeAttribute": {"core.name", "core.classType"},
        "includeTerms": "false",
        "removeDuplicateAggregateLinks": "false",
    }
    print(f"\tLineage query for: {args.viewname} params={lineageParms}")
    lineageResp = edcHelper.session.get(
        lineageURL, params=lineageParms,
    )

    lineageStatus = lineageResp.status_code
    print(f"\tlineage rc={lineageStatus}")

    # print(f"\n\n--------")
    # print(lineageResp.json())

    columnHeader = [
        "left_id",
        "left_name",
        "left_type",
        "right_id",
        "right_name",
        "right_type",
    ]
    outputFile = args.outDir + "/" + args.csvFileName
    fullpath = os.path.abspath(outputFile)
    fCSVFile = open(outputFile, "w", newline="", encoding="utf-8")
    print("csv file initialized. " + outputFile)
    colWriter = csv.writer(fCSVFile)
    colWriter.writerow(columnHeader)

    # dump the lineage result to file (for documentation/understanding)
    jsonFile = args.outDir + "/view_column_lineage.json"
    with open(jsonFile, "w") as json_file:
        json.dump(lineageResp.json(), json_file, indent=4,)

    # iterate over the lineage results and write to a csv file
    lineage_json = lineageResp.json()
    for lineageitem in lineage_json["items"]:
        outid = lineageitem["outId"]
        inid = lineageitem["inId"]
        outname = edcutils.getFactValue(lineageitem["outEmbedded"], "core.name")
        outclass = edcutils.getFactValue(lineageitem["outEmbedded"], "core.classType")
        inname = edcutils.getFactValue(lineageitem["inEmbedded"], "core.name")
        inclass = edcutils.getFactValue(lineageitem["inEmbedded"], "core.classType")
        print(f"a link to process - {outid} {outname} {outclass} -> {inid} {inname} {inclass}")
        colWriter.writerow([outid, outname, outclass, inid, inname, inclass])

    fCSVFile.close()
Code example #11
def processExternalDB(dbId, classType, dbName, resType, resName, colWriter):
    """
    dbId = the id of the external database object
    classType = the classtype of the external database object
    dbName - the name of the external db object
    resType - the resource type e.g. Oracle, SQLServer that created the
              external db object
    resName - the resource name that created the external db object
    colWriter - the csv File object to write any lineage entries

    process outline:-
    get lineage to the table level (2 parent/child hops) from the external database
        for each table, find the actual table that should be linked (case insensitive)
        if only 1 table was found (otherwise - print an error message)
            link the table and all columns
    """

    dbUnknown = False
    # counters for # of links created
    tabLinks = 0
    colLinks = 0
    errors = 0
    tables_with_no_links = 0

    db_mapping = dict()

    # 'External' can be used if the database name is not known (sqlserver use case)
    if dbName == "External":
        dbUnknown = True
        print("\tthe database for the externally referenced object is unknown")

    # lookup to see of the database name is in the mapping lookup table
    dblookupName = dbName
    if dbName.startswith("@"):
        dblookupName = dbName[1:]

    if dblookupName in mem.dblinks:
        db_mapping = mem.dblinks[dblookupName]
        print(
            f"\tactual name for db found in dblinks using lookup: {db_mapping}"
        )
        dbUnknown = False
    else:
        dbUnknown = True
    # note:  if the database type is oracle, we can't trust the name of the database
    #        since it will contain the dblink name, not the externally referenced db
    #        so we will treat it as unknown and just try to find the schema/table ref'd
    # note - better handled by lookup references... (above)
    # if resType == "Oracle":
    #     dbUnknown = True

    print(f"\ttype={classType} name={dbName} id={dbId} unknownDB:{dbUnknown}")

    # get the lineage for the database object
    lineageURL = edcHelper.baseUrl + "/access/2/catalog/data/relationships"
    lineageParms = {
        "seed": dbId,
        "association": "core.ParentChild",
        "depth": "2",
        "direction": "OUT",
        "includeAttribute": {"core.name", "core.classType"},
        "includeTerms": "false",
        "removeDuplicateAggregateLinks": "false",
    }
    print(f"\tLineage query for: {dbName} params={lineageParms}")
    lineageResp = edcHelper.session.get(
        lineageURL,
        params=lineageParms,
    )

    lineageStatus = lineageResp.status_code
    print(f"\tlineage rc={lineageStatus}")
    lineageJson = lineageResp.text

    #  bug in the relationships api call - the items collection should be
    #  "items" (with quotes)
    if lineageJson.startswith("{items"):
        # bug in 10.2.1 and before
        # replace items with "items" - for just the first occurrence
        # (note:  this is fixed after 10.2.1)
        lineageJson = json.loads(lineageJson.replace("items", '"items"', 1))
    else:
        # 10.2.1u1 + json is escaped properly
        lineageJson = json.loads(lineageJson)

    # for each item in the lineage resultset
    for lineageItem in lineageJson["items"]:
        inId = lineageItem.get("inId")
        assocId = lineageItem.get("associationId")
        inEmbedded = lineageItem.get("inEmbedded")

        schemaName = ""
        schema_substituted = False
        if assocId == "com.infa.ldm.relational.ExternalSchemaTable":
            # find the table...
            schemaName = inId.split("/")[-2]
            regex_pattern = re.compile(r".*\[\d+\].+\[\d+\]")
            if regex_pattern.match(schemaName):
                print(
                    f"\tfound unknown schema name reference id= {schemaName}")
                if "schema" in db_mapping:
                    print(
                        f"\treplacing schema={schemaName} with {db_mapping['schema']}"
                    )
                    schemaName = db_mapping["schema"]
                    schema_substituted = True

            inEmbedded = lineageItem.get("inEmbedded")
            tableName = edcutils.getFactValue(inEmbedded, "core.name")
            print(f"\tprocessing table={tableName} schema={schemaName}"
                  f" db={dbName} id={inId}")

            # format the query to find the actual table
            # "core.classType:(com.infa.ldm.relational.Table or com.infa.ldm.relational.View)  and core.name_lc_exact:"
            q = (
                'core.classType:(com.infa.ldm.relational.Table or com.infa.ldm.relational.View)  and core.name:"'
                + tableName.lower() + '"')
            if dbUnknown and schemaName == "":
                q = q + " and core.resourceName:" + resName
            # if dbUnknown==False:
            #    q=q+ ' and ' + dbName
            q = q + ' and core.resourceType:"' + resType + '"'
            tableSearchParms = {"q": q, "offset": 0, "pageSize": 100}
            print("\t\tquery=" + str(tableSearchParms))
            # find the table - with name tableName
            tResp = edcHelper.session.get(
                edcHelper.baseUrl + "/access/2/catalog/data/objects",
                params=tableSearchParms,
            )
            tStatus = tResp.status_code
            print("\t\tquery rc=" + str(tStatus))
            # print("\t\t\t\t" + str(tResp.json()))
            foundTabId = ""
            tableMatchCount = 0
            # possible matching tables
            for tableItem in tResp.json()["items"]:
                fromTableId = tableItem["id"]
                fromSchemaName = fromTableId.split("/")[-2].lower()
                # foundTabId=''
                # foundTabId = tableItem["id"]
                # print("rs-" + fromTableId + " compared with " + inId);
                if fromTableId != inId and fromTableId.count("/") == 4:
                    # filter out comparing against itself +
                    # counting the / chars - 4 = normal db, 5=external db (disregard)
                    print("\t\tchecking " + fromTableId)
                    # print("could be this one...." + foundTabId + " << " + inId)
                    theName = fromTableId.split("/")[-1]

                    # the schema could be empty - in that case it should match dbo
                    # unless we have a way of knowing what the default schema is
                    if theName.lower() == tableName.lower() and (
                            schemaName.lower() == fromSchemaName.lower() or
                        (schemaName == "" and fromSchemaName == "dbo")):
                        # table and schema names match
                        # check if there is a dblink lookup that has a database & if that matches too
                        if schema_substituted:
                            fromdbName = fromTableId.split("/")[-3].lower()
                            linkedDbName = db_mapping["database"]
                            print(
                                f"\t\t\tchecking for matching database for dblink does {fromdbName} = {linkedDbName}?: {fromdbName.lower() == linkedDbName.lower()}"
                            )
                            if fromdbName.lower() == linkedDbName.lower():
                                # make the link
                                print("\t\t\tok to make the link....")
                                tableMatchCount += 1
                                foundTabId = fromTableId
                                print(
                                    f"\t\ttable name matches...{theName}=={tableName} {inId} {foundTabId} "
                                    f" count/{fromTableId.count('/')}")
                            else:
                                print(
                                    "\t\t\tdatabase name does not match - not linkable"
                                )
                        else:
                            tableMatchCount += 1
                            foundTabId = fromTableId
                            print(
                                f"\t\ttable name matches...{theName}=={tableName} {inId} {foundTabId} "
                                f" count/{fromTableId.count('/')}")
                    else:
                        print(
                            "\t\t\tno match... linked table name:" + theName +
                            " ref table:" + tableName + " ref shema:" +
                            schemaName + " from schema:" + fromSchemaName +
                            " schema names match? : " +
                            str(schemaName.lower() == fromSchemaName.lower()) +
                            " schema_substituted: " + str(schema_substituted) +
                            " db_mapping: " + str(db_mapping))
                # else:
                # print("skipping this one");

            print("\t\ttotal matching tables=" + str(tableMatchCount) +
                  " inId:" + inId)
            if tableMatchCount == 1:
                # ok - we have a single table to match
                # link at the table level - then get the columns and link them too
                # print("linking from actual tab - to external ref tab")
                print("\t\tlinking tables " + foundTabId + " -->> " + inId)
                # get the columns for the ext Table
                # (will be a reduced set - only the linked columns)
                extCols = getColumnsForTable(inId)
                tabCols = getColumnsForTable(foundTabId)

                # link the table level
                edcutils.exportLineageLink(foundTabId, inId,
                                           "core.DataSetDataFlow", colWriter)
                tabLinks += 1
                tabColsLinked = 0

                # match the columns on the left/right side
                for toCol, toId in extCols.items():
                    # check if the toCol (the name) exists in the tabCols dict
                    # print('\t\t\tchecking toCol:' + toCol + tabCols.get(toCol))
                    fromId = tabCols.get(toCol)
                    if fromId is not None:
                        # print('\t\t\tlinking columns...' + fromId + ' --->>>' + toId)
                        edcutils.exportLineageLink(fromId, toId,
                                                   "core.DirectionalDataFlow",
                                                   colWriter)
                        tabColsLinked += 1
                        colLinks += 1
                    else:
                        print(
                            f"\t\t\tError: cannot find column {toCol} in table {inId}"
                        )
                        errors += 1
                print(f"\t\t\text cols:{len(extCols)} tablCols:{len(tabCols)}"
                      f" linked={tabColsLinked}")
                # print("\t\t\tcolumns linked=" + str(tabColsLinked))
            else:
                tables_with_no_links += 1
                print(
                    f"\t\tno single match found ({tableMatchCount}) - no links will be created"
                )
            # flush the console buffer - for tailing the stdout log
            sys.stdout.flush()

    print(
        f"external database: {dbName} processed: tab/col links created: "
        f"{tabLinks}/{colLinks} errors:{errors} tables not linked={tables_with_no_links}"
    )
    mem.tables_not_linked += tables_with_no_links
    print("")
    return tabLinks, colLinks, errors
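Example #11 looks up the database name in mem.dblinks and expects the mapped entry to optionally carry "schema" and "database" keys. The exact structure of that lookup table is not shown in this excerpt; a hedged illustration of what it might contain:

# hypothetical contents of the mem.dblinks lookup used above (all values are placeholders):
# maps a dblink name (as seen on the external database object, without a leading '@')
# to the real database/schema it refers to
dblinks_example = {
    "SALES_DBLINK": {"database": "SALESDB", "schema": "SALES_OWNER"},
    "HR_DBLINK": {"database": "HRDB", "schema": "HR"},
}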
Code example #12
def findAndUpdateCatalog(databaseName, schemaName, datasetName, attrDict):
    '''
    given a list of properties, find an object in the catalog, compare the attributes found, and update if necessary
    '''
    #datasetName = 'CRM_CUSTOMER_MAIN'
    print("\tready to find and update catalog object: " + datasetName +
          " attrs=" + str(attrDict))
    print("\tready to find and update catalog schema: " + schemaName)
    print("\tready to find and update catalog database: " + databaseName)

    header = {"Accept": "application/json"}
    #query = "+core.resourceType: " + resourceType + " +core.allclassTypes:com.infa.ldm.relational.Schema +core.name:" + schemaName + "+core.allclassTypes:com.infa.ldm.relational.Table +core.name:" + datasetName
    #query = "+core.allclassTypes:com.infa.ldm.relational.Table +core.name:" + datasetName
    query = "id:*/" + databaseName + "/" + schemaName + "/" + datasetName

    parameters = {'offset': "0", 'pageSize': "200", 'q': query}

    # execute catalog rest call, for a page of results
    print('\texecuting search: q=' + query)
    #resp = requests.get(catalogServer + '/access/2/catalog/data/objects', params=parameters, headers=header, auth=HTTPBasicAuth(uid,pwd), verify=sslCert)
    resp = requests.get(catalogServer + '/access/2/catalog/data/objects',
                        params=parameters,
                        headers=header,
                        auth=HTTPBasicAuth(uid, pwd))
    status = resp.status_code
    if status != 200:
        # some error - e.g. catalog not running, or bad credentials
        print("error! " + str(status) + str(resp.json()))
        return

    resultJson = resp.json()
    print('\tJSON response: q=' + str(resultJson))

    total = resultJson['metadata']['totalCount']
    #print(total)
    if total == 0:
        print("\t" + str(total) +
              " item(s) not found - catalog object will not be updated")
        return

    if total != 1:
        print("\t" + str(total) +
              " more than 1 result found, all of them will be updated")

    itemId = ""
    itemName = ""
    itemHref = ""
    # get the eTag
    eTag = resp.headers['ETag']
    print("\titem found: eTag=" + eTag)
    for foundItem in resultJson["items"]:
        itemId = foundItem["id"]
        itemName = edcutils.getFactValue(foundItem, "core.name")
        itemHref = foundItem["href"]

        # get the facts collection (to be updated for any new/updated attributes)
        itemFacts = foundItem['facts']

        print("\tid=" + itemId)
        print("\tname=" + itemName)
        #         print("\tfacts=" + str(itemFacts))

        # flag to determine if an updated is necessary
        updateObject = False

        # activity counts
        newFactCount = 0
        updFactCount = 0

        for excelAttrName, excelAttrVal in attrDict.items():
            print("\t\tchecking excel attr=" + excelAttrName)
            doesAttrExist = excelAttrName in custAttrDict
            print("\t\t\tattr exists in EDC? " + str(doesAttrExist))
            if doesAttrExist:
                # check if the content matches
                edcAttrVal = edcutils.getFactValue(
                    foundItem, custAttrDict.get(excelAttrName))
                print("\t\t\t edc Attribute Value " + edcAttrVal)
                if (edcAttrVal == ''):
                    newFactCount += 1
                    print("\t\t\tnew attribute value...")
                    newFact = {
                        'attributeId': custAttrDict.get(excelAttrName),
                        'value': excelAttrVal,
                        'label': excelAttrName
                    }
                    print("\t\t\t\t\t" + str(newFact))
                    itemFacts.append(newFact)
                    updateObject = True

                else:
                    print("\t\t\texisting value='" + edcAttrVal + "'=='" +
                          excelAttrVal + "' " +
                          str(edcAttrVal == excelAttrVal))
                    #
                    # compare the values - if match, do nothing
                    # if different - update
                    if (edcAttrVal == excelAttrVal):
                        print("\t\t\tvalues match - no update needed")
                    else:
                        updFactCount += 1
                        updateObject = True
                        # update the actual value...
                        print("\t\t\treplacing: attr=" +
                              custAttrDict.get(excelAttrName) + " from=" +
                              edcAttrVal + " with " + excelAttrVal)
                        replaceExistingAttrValue(
                            foundItem, custAttrDict.get(excelAttrName),
                            excelAttrVal)
            else:
                # attribute does not exist
                print("\t\t\tattr '" + excelAttrName +
                      "' does not exist - use ldmadmin ui to create it")

        # print message if nothing to do & move on to the next found item
        if not updateObject:
            print("\tno updates needed for object")
            continue

        # object should be updated
        print("\tobject should be updated - new facts:" + str(newFactCount) +
              " updated facts:" + str(updFactCount))

        # uncomment these 2 lines to help test concurrent updates - while the sleep is happening, manually update the object
        # this should cause a 412 error message (ifMatch <> eTag)
        #print("sleeping for 20 seconds - to simulate concurrent updates")
        #time.sleep(20)

        # prepare to write the item to the catalog
        putParms = {
            'Content-Type': 'application/json',
            'Accept': 'application/json',
            'If-Match': eTag
        }
        putUrl = catalogServer + '/access' + itemHref
        print("\tready to update item: url=" + putUrl + " parm=" +
              str(putParms))
        # make the PUT request (execute the update)
        #r = requests.put(putUrl, data=json.dumps(foundItem), headers=putParms, auth=HTTPBasicAuth(uid,pwd), verify=sslCert)
        r = requests.put(putUrl,
                         data=json.dumps(foundItem),
                         headers=putParms,
                         auth=HTTPBasicAuth(uid, pwd))
        putStatus = r.status_code
        print("\treturn code=" + str(putStatus))
        if putStatus == 200:
            newEtag = r.headers['eTag']
            print("\t\trequest completed: status=%d new eTag %s" %
                  (putStatus, newEtag))
        else:
            print("update failed." + str(r.json()))