Esempio n. 1
0
def restructure(tables: typing.Iterable[Table],
                prefix_header_rules=(),
                max_cols=100) -> typing.Iterable[Table]:
    """Restructure tables.

    Performs all sorts of heuristic cleaning operations, including:

        - Remove empty columns (:meth:`takco.reshape.clean.remove_empty_columns`)
        - Deduplicate header rows (:meth:`takco.reshape.clean.deduplicate_header_rows`)
        - Remove empty header rows (:meth:`takco.reshape.clean.remove_empty_header_rows`)
        - Process rowspanning head cells (:meth:`takco.reshape.clean.process_rowspanning_head_cells`)
        - Restack horizontal schema repeats (:meth:`takco.reshape.clean.restack_horizontal_schema_repeats`)
        - Remove empty rows (:meth:`takco.reshape.clean.remove_empty_rows`)
        - Process rowspanning body cells (:meth:`takco.reshape.clean.process_rowspanning_body_cells`)

    In addition, this function calls ``init_captions``, ``heuristic_transpose``
    and ``apply_prefix_header_rules`` on every table.

    Tables whose ``numCols`` is at least ``max_cols`` are skipped, and tables
    that end up without data rows are not yielded. Any exception raised while
    processing a table is logged and that table is dropped; processing then
    continues with the next table.

    Args:
        tables: Tables to restructure
        prefix_header_rules: Rules handed to ``apply_prefix_header_rules``
        max_cols: Tables with this many columns or more are skipped

    Yields:
        The restructured tables that still have data rows
    """

    for table in tables:
        try:
            table = Table(table).to_dict()

            if table.get("numCols", 0) >= max_cols:
                continue

            # Read the header rows once, falling back to "no headers" so that
            # a missing (or None) entry does not raise and drop the table.
            hs = table.get("tableHeaders") or []
            if any("tdHtmlString" in c for r in hs for c in r):
                # Header rows that consist solely of <td> cells are moved to
                # the front of the data rows.
                if all(
                        c.get("tdHtmlString", "")[:3] == "<td" for r in hs
                        for c in r):
                    table["tableData"] = hs + table.get("tableData", [])
                    table["tableHeaders"] = []

            init_captions(table)

            # Analyze headers & data together
            deduplicate_header_rows(table)

            # Analyze header
            remove_empty_header_rows(table)
            process_rowspanning_head_cells(table)
            restack_horizontal_schema_repeats(table)
            # Drop header rows that became empty
            table["tableHeaders"] = [
                h for h in (table.get("tableHeaders") or []) if h
            ]

            # Analyze body
            remove_empty_rows(table)
            process_rowspanning_body_cells(table)
            heuristic_transpose(table)
            remove_empty_columns(table)

            apply_prefix_header_rules(table, prefix_header_rules)

            if table["tableData"]:
                yield Table(table)
        except Exception as e:
            # Best effort: one broken table must not stop the whole stream
            log.error(e)
Esempio n. 2
0
def integrate(tables: List[dict], db: NaryDB, pfd_threshold=0.9):
    """Integrate tables with n-ary relations

    For every table a key column is picked with a ``PFDProfiler`` (columns
    whose class annotations contain a literal type are excluded), the cell
    texts and their entity sets are handed to ``db.integrate``, and the
    result is stored under ``table.annotations["properties"]``.

    Args:
        tables: Tables to link
        db: N-ary database; entered as a context manager for the whole run
        pfd_threshold: Probabilistic Functional Dependency Threshold
            (not referenced in the body of this function)

    Yields:
        Each table, with ``keycol`` set and ``properties`` annotations added
    """
    assert isinstance(db, NaryDB)
    with db:

        for raw_table in tables:
            table = Table(raw_table)

            log.debug(
                "Integrating table %s (%d rows)",
                table.get("_id"),
                table["numDataRows"],
            )

            # Find key column
            profiler = PFDProfiler()

            # Column index -> whether any of its annotated classes is literal
            ci_literal = {}
            for ci, ts in table.annotations.get("classes", {}).items():
                col_index = int(ci)
                ci_literal[col_index] = any(
                    SimpleTyper().is_literal_type(t) for t in ts
                )

            # Only non-literal columns are key candidates
            usecols = []
            for ci in range(table["numCols"]):
                if not ci_literal.get(ci):
                    usecols.append(ci)

            rows = []
            for data_row in table.get("tableData", []):
                rows.append([c.get("text") for c in data_row])

            keycol = profiler.get_keycol(rows, usecols)
            table["keycol"] = keycol
            log.debug(f"Got keycol {keycol}")

            # One set of entity URIs per cell, in the same layout as ``rows``
            ents = table.get("entities", {})
            row_entsets = []
            for ri, row in enumerate(rows):
                cell_sets = []
                for ci in range(len(row)):
                    cell_ents = ents.get(str(ci), {}).get(str(ri), {})
                    cell_sets.append({URIRef(s) for s in cell_ents if s})
                row_entsets.append(cell_sets)

            tocol_fromcolprop = db.integrate(rows, row_entsets)
            log.debug(f"Got tocol_fromcolprop {tocol_fromcolprop}")

            # Re-key as from-column -> to-column -> property (string keys);
            # the first property seen for a column pair is kept.
            properties = {}
            for tocol, fromcolprop in tocol_fromcolprop.items():
                for fromcol, prop in fromcolprop.items():
                    per_source = properties.setdefault(str(fromcol), {})
                    per_source.setdefault(str(tocol), prop)
            table.annotations["properties"] = properties

            yield table
Esempio n. 3
0
def tables_add_context_rows(tables, fields=()):
    """Add context to table depending on table dict fields

    For every name in ``fields`` a new leading column is inserted: header
    rows get a cell with the text ``_<field>`` and every data row gets a cell
    holding the table's value for that field, linked to itself. Column
    indexes in the ``entities``, ``classes`` and ``properties`` entries are
    shifted by the number of added columns.

    Args:
        tables: Tables to extend
        fields: Names of table dict fields to add as columns. Any iterable is
            accepted, including one-shot iterators.

    Yields:
        A ``Table`` for each input table with the context columns added
    """
    # Materialise once: the fields are walked for every table and their count
    # is needed below, which a generator or other one-shot iterable could not
    # provide.
    fields = list(fields)
    n = len(fields)

    for table in tables:
        table = Table(table).to_dict()

        # Columns are prepended one by one, so walking the fields backwards
        # leaves them in their given order from left to right.
        for field in reversed(fields):
            # Build a fresh header cell per row so rows do not share one dict
            table["tableHeaders"] = [
                [{"text": f"_{field}", "surfaceLinks": []}] + list(hrow)
                for hrow in table["tableHeaders"]
            ]
            headerText = tuple(
                tuple(cell.get("text", "").lower() for cell in r)
                for r in table["tableHeaders"])
            table["headerId"] = Table.get_headerId(headerText)

            # NOTE(review): len(fieldtext) assumes the field value is a
            # string (or at least sized) - confirm against callers.
            fieldtext = table.get(field, "")
            context_cells = [{
                "text":
                fieldtext,
                "surfaceLinks": [{
                    "offset": 0,
                    "endOffset": len(fieldtext),
                    "linkType": "INTERNAL",
                    "target": {
                        "href": fieldtext
                    },
                }],
            }]

            # Every data row gets its own copy of the context cell
            table["tableData"] = [
                copy.deepcopy(context_cells) + list(drow)
                for drow in table["tableData"]
            ]
            table["numCols"] = len(
                table["tableData"][0]) if table["tableData"] else 0

        # Existing annotations are keyed by column index (as strings); move
        # them right by the number of columns that were added.
        if "entities" in table:
            table["entities"] = {
                str(int(ci) + n): x
                for ci, x in table["entities"].items()
            }
        if "classes" in table:
            table["classes"] = {
                str(int(ci) + n): x
                for ci, x in table["classes"].items()
            }
        if "properties" in table:
            table["properties"] = {
                str(int(fci) + n):
                {str(int(tci) + n): e
                 for tci, e in te.items()}
                for fci, te in table["properties"].items()
            }

        yield Table(table)
Esempio n. 4
0
def split_compound_columns(tables, splitter):
    """Detect and split compound columns

    Every data column is passed to ``splitter.find_splits``. When that
    returns splits, the column is replaced by one new column per split and
    the split's first element is appended to the text of the last header
    cell of that column; otherwise the column is kept unchanged.

    Tables without header columns are processed as well; their
    ``tableHeaders`` entry is left as it is.

    Args:
        tables: Tables to process
        splitter: Object used as a context manager around the whole run,
            whose ``find_splits(column)`` yields 3-tuples of which the first
            element is a header suffix (may be falsy) and the third the new
            column

    Yields:
        A ``Table`` for each input table
    """

    log.info(f"Splitting compound columns using {splitter}")

    with splitter:
        for table in tables:
            table = Table(table).to_dict()

            newcols = []
            headcols = list(zip(*table.get("tableHeaders", [])))
            datacols = list(zip(*table.get("tableData", [])))

            # Without header columns, zipping headers with data would yield
            # nothing and the table would never be split. Pair every data
            # column with an empty header column instead.
            has_headers = bool(headcols)
            if not has_headers:
                headcols = [()] * len(datacols)

            for ci, (hcol, dcol) in enumerate(zip(headcols, datacols)):
                splits = list(splitter.find_splits(dcol))
                if splits:
                    log.debug(
                        f"Found {len(splits)} splits in column {ci} of {table.get('_id')}: {list(zip(*splits))[:2]}"
                    )
                    for part, _, newcol in splits:
                        newhcol = list(hcol)
                        if newhcol:
                            # Copy the cell so the sibling columns created
                            # from the same split do not share it
                            newhcol[-1] = dict(newhcol[-1])
                            part = part or ""
                            newhcol[-1]["text"] = (
                                newhcol[-1].get("text", "") + " " + part
                            )
                        newcols.append((newhcol, newcol))
                else:
                    newcols.append((hcol, dcol))

            if newcols:
                headcols, datacols = zip(*newcols)
                if has_headers:
                    # Transpose the columns back into header rows
                    table["tableHeaders"] = list(zip(*headcols))
                table["tableData"] = list(zip(*datacols))

            yield Table(table)
Esempio n. 5
0
def lookup_hyperlinks(tables: List[dict], lookup: Lookup, lookup_cells=False):
    """Lookup the (Wikipedia) hyperlinks inside cells for entity links

    The hyperlinks of each table body are collected with ``get_hrefs`` and
    resolved through the lookup; the results are merged into the table's
    ``entities`` annotations, keyed by column and then by row.

    Args:
        tables: Tables to link
        lookup: A :mod:`takco.link.base.Lookup` object; it is entered as a
            context manager and flushed each time the generator is resumed
            after yielding a table
        lookup_cells: Passed on to ``get_hrefs``

    Yields:
        Each table with its ``entities`` annotations updated
    """
    assert isinstance(lookup, Lookup)
    with lookup as look:
        for raw_table in tables:
            table = Table(raw_table)

            log.debug(f"Looking up hyperlinks of {table.get('_id')} using {look}")
            body = table.get("tableData", [])
            hrefs = get_hrefs(body, lookup_cells=lookup_cells)

            entities = table.annotations.setdefault("entities", {})
            found = look.lookup_cells(hrefs)
            for col, per_row in found.items():
                for row, linked in per_row.items():
                    # Create the nested dicts lazily so that a column without
                    # any row results adds no entry
                    col_entities = entities.setdefault(col, {})
                    cell_entities = col_entities.setdefault(row, {})
                    cell_entities.update(linked)

            yield table

            look.flush()
Esempio n. 6
0
def link(
    tables: List[dict], linker: Linker, usecols: Union[str, List[int]] = None,
):
    """Link table entities to KB

    For every table two helper attributes are computed and stored:
    ``non_numeric_cols`` (columns in which at most half of the cells look
    numeric) and ``heuristic_key`` (the first column that is mostly
    non-numeric and more than 90% unique, or ``[]`` if there is none).

    Args:
        tables: Tables to link
        linker: Entity linker; it is entered as a context manager and flushed
            each time the generator is resumed after yielding a table
        usecols: Columns to use (table attribute name or list of column indexes).
            When nothing is selected, the table's ``non_numeric_cols`` are
            used. A table is skipped (not yielded) when the selection
            contains anything other than integers.

    Yields:
        Each linked table, with the linker's results merged into its
        annotations
    """
    assert isinstance(linker, Linker)

    # These helpers do not depend on the table, so build them once instead of
    # once per table.
    nopunct = str.maketrans("", "", string.punctuation + " ")

    def isnum(x):
        # Numeric if more than half of the characters that remain after
        # dropping punctuation and spaces are numeric
        x = x.translate(nopunct)
        return sum(map(str.isnumeric, x)) / len(x) > 0.5 if x else False

    def numscore(col):
        return sum(int(isnum(c)) for c in col) / len(col)

    def uniqscore(col):
        return len(set(col)) / len(col)

    def heur(col):
        return (numscore(col) < 0.5) and (uniqscore(col) > 0.9)

    with linker:
        for table in tables:
            table = Table(table)
            rows = table.body

            if not rows:
                log.debug(f"No rows in table {table.get('_id')}")

            cols = list(zip(*rows))

            table["non_numeric_cols"] = [
                i for i, c in enumerate(cols) if not numscore(c) > 0.5
            ]

            heuristic_keys = [i for i, c in enumerate(cols) if heur(c)]
            table["heuristic_key"] = heuristic_keys[0] if heuristic_keys else []

            # Restrict columns to link (e.g. 'keycol', or 'entcols')
            if isinstance(usecols, (list, tuple)):
                # Explicit column indexes: use them directly instead of
                # looking up their string form as a table attribute
                table_usecols = list(usecols)
            else:
                table_usecols = table.get(str(usecols))

            # Fall back only when nothing was selected. Column index 0 is a
            # valid selection even though it is falsy.
            if type(table_usecols) is not int and not table_usecols:
                table_usecols = table["non_numeric_cols"]

            if not isinstance(table_usecols, list):
                table_usecols = [table_usecols]
            if not all(type(c) is int for c in table_usecols):
                log.debug(
                    f"Skipping table {table.get('_id')}, usecols = {table_usecols}"
                )
                continue

            if table_usecols:
                log.debug(
                    f"Linking columns {table_usecols} of table {table.get('_id')}"
                )
            else:
                log.debug(f"Linking table {table.get('_id')}")

            links = linker.link(rows, usecols=table_usecols, existing=table)
            table.annotations.update(links)
            yield table

            linker.flush()