Python ExcelReader Examples, pyapacheatlas.readers.excel.ExcelReader Python Examples

Example #1

0

Show file

File: test_excel.py Project: AmjadMKhan/pyapacheatlas

def test_excel_bulkEntities_withClassifications():
    """Check that the "classifications" column becomes classification entries.

    Two rows are written to the BulkEntities sheet of a template workbook:
    one with a single classification ("PII") and one with two values
    separated by a semicolon ("PII;CLASS2"). The parsed entities must carry
    one and two classifications respectively.
    """
    temp_filepath = "./temp_test_excel_bulkEntitiesWithClassifications.xlsx"
    ec = ExcelConfiguration()
    reader = ExcelReader(ec)
    max_cols = len(ExcelReader.TEMPLATE_HEADERS["BulkEntities"])
    # Column order: "typeName", "name", "qualifiedName", "classifications"
    json_rows = [
        ["demoType", "entityNameABC", "qualifiedNameofEntityNameABC", "PII"],
        ["demoType", "entityNameGHI", "qualifiedNameofEntityNameGHI",
         "PII;CLASS2"],
    ]

    setup_workbook(temp_filepath, "BulkEntities", max_cols, json_rows)

    # Parsing happens inside the try so the temporary workbook is removed
    # even when the reader itself raises.
    try:
        results = reader.parse_bulk_entities(temp_filepath)

        assert "entities" in results
        assert len(results["entities"]) == 2
        abc = results["entities"][0]
        ghi = results["entities"][1]

        assert len(abc["classifications"]) == 1
        assert len(ghi["classifications"]) == 2

        assert abc["classifications"][0]["typeName"] == "PII"
        # Order of the split classifications is not asserted, only the set.
        ghi_classification_types = {
            x["typeName"] for x in ghi["classifications"]
        }
        assert ghi_classification_types == {"PII", "CLASS2"}
    finally:
        remove_workbook(temp_filepath)

Example #2

0

Show file

File: test_excel.py Project: svchandramohan/pyapacheatlas

def test_excel_typeDefs_entityTypes():
    """Check that one EntityDefs row yields one entity def with one attribute.

    A single attribute row for "demoType" is written to the EntityDefs sheet
    of a template workbook; the parsed result must contain exactly one
    entity definition whose first attribute is named "attrib1".
    """
    temp_filepath = "./temp_test_typeDefs_entityTYpes.xlsx"
    ec = ExcelConfiguration()
    reader = ExcelReader(ec)
    # The column count must come from the sheet that is actually written
    # (EntityDefs); using another sheet's headers truncates the row.
    max_cols = len(ExcelReader.TEMPLATE_HEADERS["EntityDefs"])
    # Column order:
    # "Entity TypeName", "name", "description",
    # "isOptional", "isUnique", "defaultValue",
    # "typeName", "displayName", "valuesMinCount",
    # "valuesMaxCount", "cardinality", "includeInNotification",
    # "indexType", "isIndexable"
    json_rows = [
        ["demoType", "attrib1", "Some desc",
         "True", "False", None,
         "string", None, None,
         None, None, None,
         None, None
         ]
    ]
    setup_workbook(temp_filepath, "EntityDefs", max_cols, json_rows)

    # try/finally guarantees the temporary workbook is removed even when
    # parsing or an assertion fails.
    try:
        results = reader.parse_entity_defs(temp_filepath)

        assert "entityDefs" in results
        assert len(results["entityDefs"]) == 1
        assert results["entityDefs"][0]["attributeDefs"][0]["name"] == "attrib1"
    finally:
        remove_workbook(temp_filepath)

Example #3

0

Show file

File: test_excel.py Project: svchandramohan/pyapacheatlas

def test_excel_table_lineage():
    """Check that one TablesLineage row yields target, source and process.

    The parsed results are compared, in order, against the minimal JSON of
    the target table, the source table and the process, with guids
    -1001, -1002 and -1003.
    """
    temp_filepath = "./temp_test_excel_table_lineage.xlsx"
    ec = ExcelConfiguration()
    reader = ExcelReader(ec)
    max_cols = len(ExcelReader.TEMPLATE_HEADERS["TablesLineage"])

    # Column order:
    # "Target Table", "Target Type", "Target Classifications",
    # "Source Table", "Source Type", "Source Classifications",
    # "Process Name", "Process Type"
    json_rows = [
        ["table1", "demo_type", None,
         "table0", "demo_type2", None,
         "proc01", "proc_type"
         ]
    ]

    setup_workbook(temp_filepath, "TablesLineage", max_cols, json_rows)

    # Parsing happens inside the try so the temporary workbook is removed
    # even when the reader itself raises.
    try:
        results = reader.parse_table_lineage(temp_filepath)

        assert results[0].to_json(minimum=True) == {
            "typeName": "demo_type", "guid": -1001, "qualifiedName": "table1"}
        assert results[1].to_json(minimum=True) == {
            "typeName": "demo_type2", "guid": -1002, "qualifiedName": "table0"}
        assert results[2].to_json(minimum=True) == {
            "typeName": "proc_type", "guid": -1003, "qualifiedName": "proc01"}
    finally:
        remove_workbook(temp_filepath)

Example #4

0

Show file

File: test_excel.py Project: wjohnson/pyapacheatlas

def test_excel_classification_defs():
    """Check that one ClassificationDefs row yields one classification def.

    A custom workbook with a ClassificationDefs sheet is created, a single
    row is written, and the parsed result must contain only the
    "classificationDefs" key holding exactly one definition.
    """
    temp_filepath = "./temp_test_excel_classificationDefs.xlsx"
    ec = ExcelConfiguration()
    reader = ExcelReader(ec)

    headers = ExcelReader.TEMPLATE_HEADERS["ClassificationDefs"]

    # A single, well-formed row. Adjacent string literals without a comma
    # are concatenated by Python, so every value needs its own separator;
    # the assertions below expect exactly one definition, hence one row.
    # NOTE(review): the values presumably map to name, entity types and
    # description -- confirm against TEMPLATE_HEADERS["ClassificationDefs"].
    json_rows = [
        ["testClassification", None, "This is my classification"],
    ]

    setup_workbook_custom_sheet(
        temp_filepath, "ClassificationDefs", headers, json_rows)

    # Parsing happens inside the try so the temporary workbook is removed
    # even when the reader itself raises.
    try:
        results = reader.parse_classification_defs(temp_filepath)

        assert len(results) == 1
        assert "classificationDefs" in results
        assert len(results["classificationDefs"]) == 1
    finally:
        remove_workbook(temp_filepath)

Example #5

0

Show file

File: test_excel.py Project: svchandramohan/pyapacheatlas

def setup_workbook(filepath, sheet_name, max_col, json_rows):
    """Write ``json_rows`` into an existing sheet of a template workbook.

    If ``filepath`` does not exist yet, a template is created there first
    with ``ExcelReader.make_template``. Values are written starting at
    row 2, one list of ``json_rows`` per worksheet row, using the first
    ``max_col`` columns, and the workbook is saved back to ``filepath``.

    :param filepath: Path of the workbook to create or update.
    :param sheet_name: Name of the sheet to fill.
    :param max_col: Number of columns to write per row.
    :param json_rows: List of rows, each a list of cell values.
    """
    template_missing = not os.path.exists(filepath)
    if template_missing:
        ExcelReader.make_template(filepath)

    workbook = load_workbook(filepath)
    sheet = workbook[sheet_name]

    # TODO: Make the max_col more dynamic
    last_row = len(json_rows) + 1
    target_rows = sheet.iter_rows(
        min_row=2, max_col=max_col, max_row=last_row)
    for row_idx, cells in enumerate(target_rows):
        values = json_rows[row_idx]
        for col_idx, cell in enumerate(cells):
            cell.value = values[col_idx]

    workbook.save(filepath)

Example #6

0

Show file

File: test_excel.py Project: svchandramohan/pyapacheatlas

def setup_workbook_custom_sheet(filepath, sheet_name, headers, json_rows):
    """Create a fresh workbook with one custom sheet and fill it with rows.

    The new workbook's active sheet is renamed to ``sheet_name`` and passed,
    together with ``headers``, to ``ExcelReader._update_sheet_headers``
    (presumably this writes the header row -- verify in the reader). The
    values in ``json_rows`` are then written starting at row 2, using as
    many columns as there are headers, and the workbook is saved to
    ``filepath`` and closed.

    :param filepath: Path where the workbook is saved.
    :param sheet_name: Title given to the workbook's active sheet.
    :param headers: List of header names for the sheet.
    :param json_rows: List of rows, each a list of cell values.
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = sheet_name
    ExcelReader._update_sheet_headers(headers, sheet)

    # TODO: Clear the column headers
    # Add the data to the sheet
    column_count = len(headers)
    last_row = len(json_rows) + 1
    target_rows = sheet.iter_rows(
        min_row=2, max_col=column_count, max_row=last_row)
    for row_idx, cells in enumerate(target_rows):
        values = json_rows[row_idx]
        for col_idx, cell in enumerate(cells):
            cell.value = values[col_idx]

    workbook.save(filepath)
    workbook.close()

Example #7

0

Show file

File: test_excel.py Project: wjohnson/pyapacheatlas

def test_excel_bulkEntities_meanings_relationships():
    """Check that a "[Relationship] meanings" column fills relationship attributes.

    Three entities are written with an extra "[Relationship] meanings"
    column: one empty, one with a single term and one with two terms
    separated by a semicolon. The empty one must have no "meanings"
    relationship attribute; the others must have one and two entries.
    """
    temp_filepath = "./temp_test_excel_bulkEntitieswithMeanings.xlsx"
    ec = ExcelConfiguration()
    reader = ExcelReader(ec)

    headers = ExcelReader.TEMPLATE_HEADERS["BulkEntities"] + \
        ["[Relationship] meanings"]
    # Column order:
    # "typeName", "name",
    # "qualifiedName", "classifications"
    # "[Relationship] meanings"
    json_rows = [
        ["demoType", "entityNameABC",
         "qualifiedNameofEntityNameABC", None,
         None
         ],
        ["demoType", "entityNameGHI",
         "qualifiedNameofEntityNameGHI", None,
         "termA"
         ],
        ["demoType", "entityNameXYZ",
         "qualifiedNameofEntityNameXYZ", None,
         "term1;term2"
         ]
    ]

    setup_workbook_custom_sheet(
        temp_filepath, "BulkEntities", headers, json_rows)

    # Parsing happens inside the try so the temporary workbook is removed
    # even when the reader itself raises.
    try:
        results = reader.parse_bulk_entities(temp_filepath)

        abc = results["entities"][0]
        ghi = results["entities"][1]
        xyz = results["entities"][2]

        assert "meanings" not in abc["relationshipAttributes"]
        assert "meanings" in ghi["relationshipAttributes"]
        ghi_terms = ghi["relationshipAttributes"]["meanings"]
        xyz_terms = xyz["relationshipAttributes"]["meanings"]

        assert len(ghi_terms) == 1
        assert len(xyz_terms) == 2
    finally:
        remove_workbook(temp_filepath)

Example #8

0

Show file

File: test_excel.py Project: AmjadMKhan/pyapacheatlas

def test_verify_template_sheets():
    """Check that the generated template contains exactly the expected sheets.

    A template is written with ``ExcelReader.make_template``, reopened, and
    its sheet names are compared with the expected set; any sheet that is
    missing or unexpected fails the test.
    """
    # Setup
    temp_path = "./temp_verfiysheets.xlsx"
    ExcelReader.make_template(temp_path)

    # Expected
    expected_sheets = {
        "ColumnsLineage", "TablesLineage", "EntityDefs", "BulkEntities",
        "UpdateLineage",
    }

    # The outer try removes the template even if it cannot be loaded; the
    # inner one closes the workbook before the file is deleted.
    try:
        wb = load_workbook(temp_path)
        try:
            difference = set(wb.sheetnames).symmetric_difference(
                expected_sheets)
            assert len(difference) == 0, (
                "Unexpected or missing sheets: {}".format(sorted(difference)))
        finally:
            wb.close()
    finally:
        os.remove(temp_path)

Example #9

0

Show file

File: test_excel.py Project: AmjadMKhan/pyapacheatlas

def test_excel_bulkEntities():
    """Check that two BulkEntities rows are parsed into two entities.

    Two rows without classifications are written to the BulkEntities sheet
    of a template workbook; the parsed result must expose an "entities"
    list of length two.
    """
    temp_filepath = "./temp_test_excel_bulkEntities.xlsx"
    ec = ExcelConfiguration()
    reader = ExcelReader(ec)
    max_cols = len(ExcelReader.TEMPLATE_HEADERS["BulkEntities"])
    # Column order: "typeName", "name", "qualifiedName", "classifications"
    json_rows = [
        ["demoType", "entityNameABC", "qualifiedNameofEntityNameABC", None],
        ["demoType", "entityNameGHI", "qualifiedNameofEntityNameGHI", None],
    ]
    setup_workbook(temp_filepath, "BulkEntities", max_cols, json_rows)

    # Parsing happens inside the try so the temporary workbook is removed
    # even when the reader itself raises.
    try:
        results = reader.parse_bulk_entities(temp_filepath)

        assert "entities" in results
        assert len(results["entities"]) == 2
    finally:
        remove_workbook(temp_filepath)

Example #10

0

Show file

File: test_excel.py Project: svchandramohan/pyapacheatlas

def test_excel_bulkEntities_dynamicAttributes():
    """Check that extra BulkEntities columns become entity attributes.

    Two extra columns ("attrib1", "attrib2") are appended to the standard
    BulkEntities headers. A cell left empty must not produce an attribute
    on the parsed entity; a filled cell must appear under "attributes"
    with the written value.
    """
    temp_filepath = "./temp_test_excel_bulkEntitieswithAttributes.xlsx"
    ec = ExcelConfiguration()
    reader = ExcelReader(ec)

    headers = ExcelReader.TEMPLATE_HEADERS["BulkEntities"] + \
        ["attrib1", "attrib2"]
    # Column order:
    # "typeName", "name",
    # "qualifiedName", "classifications"
    # "attrib1", "attrib2"
    json_rows = [
        ["demoType", "entityNameABC",
         "qualifiedNameofEntityNameABC", None,
         None, "abc"
         ],
        ["demoType", "entityNameGHI",
         "qualifiedNameofEntityNameGHI", None,
         "ghi", "abc2"
         ]
    ]

    setup_workbook_custom_sheet(
        temp_filepath, "BulkEntities", headers, json_rows)

    # Parsing happens inside the try so the temporary workbook is removed
    # even when the reader itself raises.
    try:
        results = reader.parse_bulk_entities(temp_filepath)

        assert "entities" in results
        assert len(results["entities"]) == 2
        abc = results["entities"][0]
        ghi = results["entities"][1]

        # First row: attrib1 was left empty, attrib2 was filled.
        assert "attrib1" not in abc["attributes"]
        assert "attrib2" in abc["attributes"]
        assert abc["attributes"]["attrib2"] == "abc"

        # Second row: both dynamic attributes were filled.
        assert "attrib1" in ghi["attributes"]
        assert "attrib2" in ghi["attributes"]
        assert ghi["attributes"]["attrib2"] == "abc2"
        assert ghi["attributes"]["attrib1"] == "ghi"
    finally:
        remove_workbook(temp_filepath)

Example #11

0

Show file

File: test_excel.py Project: wjohnson/pyapacheatlas

def test_excel_typeDefs_entityTypes_superTypes():
    """Check that "Entity superTypes" values are merged per entity type.

    Three attribute rows for the same entity type ("demoType") are written
    with an extra "Entity superTypes" column holding "DataSet;Blah",
    "Asset" and nothing. The parsed result must contain one entity def
    whose superTypes are exactly DataSet, Blah and Asset.
    """
    temp_filepath = "./temp_test_typeDefs_entityTypesWithSuperTypes.xlsx"
    ec = ExcelConfiguration()
    reader = ExcelReader(ec)
    headers = ExcelReader.TEMPLATE_HEADERS["EntityDefs"] + ['Entity superTypes']
    # Column order:
    # "Entity TypeName", "name", "description",
    # "isOptional", "isUnique", "defaultValue",
    # "typeName", "displayName", "valuesMinCount",
    # "valuesMaxCount", "cardinality", "includeInNotification",
    # "indexType", "isIndexable", "Entity superTypes"
    json_rows = [
        ["demoType", "attrib1", "Some desc",
         "True", "False", None,
         "string", None, None,
         None, None, None,
         None, None, "DataSet;Blah"
         ],
        ["demoType", "attrib2", "Some desc",
         "True", "False", None,
         "string", None, None,
         None, None, None,
         None, None, "Asset"
         ],
        ["demoType", "attrib3", "Some desc",
         "True", "False", None,
         "string", None, None,
         None, None, None,
         None, None, None
         ]
    ]

    setup_workbook_custom_sheet(
        temp_filepath, "EntityDefs", headers, json_rows)

    # try/finally guarantees the temporary workbook is removed even when
    # parsing or an assertion fails.
    try:
        results = reader.parse_entity_defs(temp_filepath)

        assert "entityDefs" in results
        assert len(results["entityDefs"]) == 1
        superTypes = results["entityDefs"][0]["superTypes"]
        # The length check rules out duplicates; the set check ignores order.
        assert len(superTypes) == 3
        assert set(superTypes) == {"DataSet", "Blah", "Asset"}
    finally:
        remove_workbook(temp_filepath)

Example #12

0

Show file

File: test_excel.py Project: AmjadMKhan/pyapacheatlas

def test_excel_update_lineage():
    """Check that one UpdateLineage row is parsed into one result.

    A custom workbook with an UpdateLineage sheet and a single data row is
    created; the parsed result must have length one.
    """
    temp_filepath = "./temp_test_excel_updateLineage.xlsx"
    ec = ExcelConfiguration()
    reader = ExcelReader(ec)

    headers = ExcelReader.TEMPLATE_HEADERS["UpdateLineage"]

    # Same as main test
    json_rows = [[
        "demo_table", "demotarget", "demo_table2", "demosource", "proc01",
        "procqual01", "Process2"
    ]]

    setup_workbook_custom_sheet(temp_filepath, "UpdateLineage", headers,
                                json_rows)

    # Parsing happens inside the try so the temporary workbook is removed
    # even when the reader itself raises.
    try:
        results = reader.parse_update_lineage(temp_filepath)

        assert len(results) == 1
    finally:
        remove_workbook(temp_filepath)

Example #13

0

Show file

File: test_excel.py Project: AmjadMKhan/pyapacheatlas

def test_excel_column_lineage():
    """Check column lineage parsing with ``use_column_mapping=True``.

    One table lineage row and three column lineage rows are written to the
    same workbook. After parsing the table lineage, no entity may carry a
    "columnMapping" attribute. After parsing the column lineage with column
    mapping enabled, seven column-level entities are expected and the
    "proc01" table process must carry a JSON "columnMapping" attribute
    describing the table0 -> table1 dataset mapping and its three column
    pairs.
    """
    temp_filepath = "./temp_test_excel_column_lineage.xlsx"
    ec = ExcelConfiguration()
    reader = ExcelReader(ec)
    max_cols_tl = len(ExcelReader.TEMPLATE_HEADERS["TablesLineage"])
    max_cols_cl = len(ExcelReader.TEMPLATE_HEADERS["ColumnsLineage"])

    # Column order:
    # "Target Table", "Target Type", "Target Classifications",
    # "Source Table", "Source Type", "Source Classifications",
    # "Process Name", "Process Type"
    json_rows = [[
        "table1", "demo_table", None, "table0", "demo_table", None, "proc01",
        "demo_process"
    ]]

    # Column order:
    # "Target Table", "Target Column", "Target Classifications",
    # "Source Table", "Source Column", "Source Classifications",
    # "Transformation"
    json_rows_col = [
        ["table1", "t00", None, "table0", "t00", None, None],
        ["table1", "tcombo", None, "table0", "tA", None, None],
        ["table1", "tcombo", None, "table0", "tB", None, None],
    ]

    setup_workbook(temp_filepath, "TablesLineage", max_cols_tl, json_rows)

    # Everything after the workbook exists runs inside the try so the
    # temporary file is removed even when setup, parsing or an assertion
    # fails.
    try:
        setup_workbook(
            temp_filepath, "ColumnsLineage", max_cols_cl, json_rows_col)

        atlas_types = column_lineage_scaffold("demo")

        table_entities = reader.parse_table_lineage(temp_filepath)

        # Before column lineage is parsed, table_entities do not contain
        # columnMapping
        assert all(
            "columnMapping" not in e.attributes for e in table_entities)

        column_entities = reader.parse_column_lineage(temp_filepath,
                                                      table_entities,
                                                      atlas_types,
                                                      use_column_mapping=True)

        table_lookup = {e.get_name(): e for e in table_entities}

        # We have five columns (t00 > t00) + ((tA + tB) > tcombo)
        # and two processes
        assert len(column_entities) == 7

        # Because of column mappings is TRUE, table entities are modified
        assert "columnMapping" in table_lookup["proc01"].attributes
        resulting_col_map = json.loads(
            table_lookup["proc01"].attributes["columnMapping"])[0]
        expected_col_map = {
            "DatasetMapping": {
                "Source": "table0",
                "Sink": "table1"
            },
            "ColumnMapping": [{
                "Source": "t00",
                "Sink": "t00"
            }, {
                "Source": "tA",
                "Sink": "tcombo"
            }, {
                "Source": "tB",
                "Sink": "tcombo"
            }]
        }
        assert (resulting_col_map["DatasetMapping"] ==
                expected_col_map["DatasetMapping"])
        # The order of the column pairs is not asserted: the count must
        # match and every produced pair must be one of the expected ones.
        assert len(resulting_col_map["ColumnMapping"]) == 3
        for column_pair in resulting_col_map["ColumnMapping"]:
            assert column_pair in expected_col_map["ColumnMapping"]
    finally:
        remove_workbook(temp_filepath)