Ejemplo n.º 1
0
def tables_add_context_rows(tables, fields=()):
    """Add context to table depending on table dict fields

    For every name in ``fields``, prepends one header cell (``_<field>``) to
    each header row and one data cell holding the table's value for that field
    (with a self-referencing surface link) to each body row.  Column-indexed
    annotations are shifted right so they still point at the original columns.

    Args:
        tables: Iterable of table dicts (anything ``Table`` accepts).
        fields: Table-level keys whose values become new context columns.

    Yields:
        Table: table with ``len(fields)`` extra leading columns.
    """
    for table in tables:
        table = Table(table).to_dict()

        # Reverse iteration so that fields[0] ends up as the leftmost column.
        for field in list(fields)[::-1]:
            empty_header = {
                "text": f"_{field}",
                "surfaceLinks": [],
            }
            table["tableHeaders"] = [[empty_header] + list(hrow)
                                     for hrow in table["tableHeaders"]]
            tableHeaders = table["tableHeaders"]
            # Recompute the header fingerprint after changing the header rows.
            headerText = tuple(
                tuple([cell.get("text", "").lower() for cell in r])
                for r in tableHeaders)
            table["headerId"] = Table.get_headerId(headerText)

            fieldtext = table.get(field, "")
            # The new context cell links to its own text (internal link).
            context_cells = [{
                "text":
                fieldtext,
                "surfaceLinks": [{
                    "offset": 0,
                    "endOffset": len(fieldtext),
                    "linkType": "INTERNAL",
                    "target": {
                        "href": fieldtext
                    },
                }],
            }]

            # Deep-copy per row so rows don't share one mutable cell dict.
            table["tableData"] = [
                copy.deepcopy(context_cells) + list(drow)
                for drow in table["tableData"]
            ]
            table["numCols"] = len(
                table["tableData"][0]) if table["tableData"] else 0

        # Shift column-indexed annotations right by the number of added columns.
        n = len(fields)
        if "entities" in table:
            table["entities"] = {
                str(int(ci) + n): x
                for ci, x in table["entities"].items()
            }
        if "classes" in table:
            table["classes"] = {
                str(int(ci) + n): x
                for ci, x in table["classes"].items()
            }
        if "properties" in table:
            # Properties are keyed by both from-column and to-column; shift both.
            table["properties"] = {
                str(int(fci) + n):
                {str(int(tci) + n): e
                 for tci, e in te.items()}
                for fci, te in table["properties"].items()
            }

        yield Table(table)
Ejemplo n.º 2
0
def yield_pivots(headertexts: Iterable[Collection[Collection[str]]],
                 heuristics: List[PivotFinder]):
    """Detect headers that should be unpivoted using heuristics.

    Yields ``(headerId, pivot)`` pairs for every header on which the longest
    detected pivot can actually be applied without raising.
    """
    import copy

    with contextlib.ExitStack() as stack:
        # Deep-copy the heuristics so the caller's objects stay untouched,
        # and enter them as contexts so they can set up their resources.
        active = [stack.enter_context(copy.deepcopy(h)) for h in heuristics]
        by_name = {h.name: h for h in active}
        for headertext in headertexts:
            if not headertext:
                continue
            pivot = find_longest_pivot(headertext, active)
            if pivot is None:
                continue
            try:
                # Dry-run the unpivot on a dummy body (one row of column
                # indices) to verify the pivot is applicable.
                width = len(next(iter(headertext)))
                dummy_body = [[str(ci) for ci in range(width)]]
                by_name[pivot.source].unpivot(headertext, dummy_body, pivot)
                yield Table.get_headerId(headertext), pivot
            except Exception as e:
                log.debug(
                    f"Failed to unpivot header {headertext} with {pivot.source} due to {e}"
                )
Ejemplo n.º 3
0
def restructure(tables: typing.Iterable[Table],
                prefix_header_rules=(),
                max_cols=100) -> typing.Iterable[Table]:
    """Restructure tables.

    Performs all sorts of heuristic cleaning operations, including:

        - Remove empty columns (:meth:`takco.reshape.clean.remove_empty_columns`)
        - Deduplicate header rows (:meth:`takco.reshape.clean.deduplicate_header_rows`)
        - Remove empty header rows (:meth:`takco.reshape.clean.remove_empty_header_rows`)
        - Process rowspanning head cells (:meth:`takco.reshape.clean.process_rowspanning_head_cells`)
        - Restack horizontal schema repeats (:meth:`takco.reshape.clean.restack_horizontal_schema_repeats`)
        - Remove empty rows (:meth:`takco.reshape.clean.remove_empty_rows`)
        - Process rowspanning body cells (:meth:`takco.reshape.clean.process_rowspanning_body_cells`)

    Args:
        tables: Tables to clean.
        prefix_header_rules: Rules forwarded to ``apply_prefix_header_rules``.
        max_cols: Tables with at least this many columns are skipped.

    Yields:
        Table: cleaned tables that still have body rows.
    """

    for table in tables:
        try:
            table = Table(table).to_dict()

            # Very wide tables are usually layout artifacts; skip them.
            if table.get("numCols", 0) >= max_cols:
                continue

            # Fix: default to [] so a missing "tableHeaders" key doesn't raise
            # TypeError here (the original .get() had no default, so such
            # tables were silently dropped via the broad except below).
            hs = table.get("tableHeaders", [])
            if any("tdHtmlString" in c for r in hs for c in r):
                # If every "header" cell was rendered as a <td>, the header
                # rows are really data rows: demote them into the body.
                if all(c.get("tdHtmlString", "")[:3] == "<td" for r in hs
                       for c in r):
                    table["tableData"] = hs + table.get("tableData", [])
                    table["tableHeaders"] = []

            init_captions(table)

            # Analyze headers & data together
            deduplicate_header_rows(table)

            # Analyze header
            remove_empty_header_rows(table)
            process_rowspanning_head_cells(table)
            restack_horizontal_schema_repeats(table)
            table["tableHeaders"] = [h for h in table["tableHeaders"] if h]

            # Analyze body
            remove_empty_rows(table)
            process_rowspanning_body_cells(table)
            heuristic_transpose(table)
            remove_empty_columns(table)

            apply_prefix_header_rules(table, prefix_header_rules)

            # Only yield tables that still have data after cleaning.
            if table["tableData"]:
                yield Table(table)
        except Exception as e:
            # Best-effort pipeline: log the failure and continue with the
            # next table rather than aborting the whole stream.
            log.error(e)
Ejemplo n.º 4
0
def lookup_hyperlinks(tables: List[dict], lookup: Lookup, lookup_cells=False):
    """Lookup the (Wikipedia) hyperlinks inside cells for entity links

    Args:
        tables: Tables to link
        lookup: A :mod:`takco.link.base.Lookup` object used to resolve hrefs
        lookup_cells: Whether to also look up plain cell text
    """
    assert isinstance(lookup, Lookup)
    with lookup as look:
        for raw in tables:
            table = Table(raw)

            log.debug(f"Looking up hyperlinks of {table.get('_id')} using {look}")
            hrefs = get_hrefs(table.get("tableData", []), lookup_cells=lookup_cells)
            entities = table.annotations.setdefault("entities", {})
            # Merge looked-up entities into the (column -> row -> ents) map.
            col_rowents = look.lookup_cells(hrefs)
            for ci, ri_ents in col_rowents.items():
                for ri, found in ri_ents.items():
                    entities.setdefault(ci, {}).setdefault(ri, {}).update(found)
            yield table

            # Flush after each table so results are persisted incrementally.
            look.flush()
Ejemplo n.º 5
0
def split_compound_columns(tables, splitter):
    """Detect and split compound columns"""

    log.info(f"Splitting compound columns using {splitter}")

    with splitter:
        for raw in tables:
            table = Table(raw).to_dict()

            result_cols = []
            header_columns = list(zip(*table.get("tableHeaders", [])))
            data_columns = list(zip(*table.get("tableData", [])))

            for ci, (hcol, dcol) in enumerate(zip(header_columns, data_columns)):
                splits = list(splitter.find_splits(dcol))
                if not splits:
                    # No compound content detected: keep the column as-is.
                    result_cols.append((hcol, dcol))
                    continue
                log.debug(
                    f"Found {len(splits)} splits in column {ci} of {table.get('_id')}: {list(zip(*splits))[:2]}"
                )
                # One new column per split part; the part label is appended
                # to the text of the bottom-most header cell.
                for part, _, split_col in splits:
                    new_header = list(hcol)
                    if new_header:
                        bottom = dict(new_header[-1])
                        label = part or ""
                        bottom["text"] = bottom.get("text", "") + " " + label
                        new_header[-1] = bottom
                    result_cols.append((new_header, split_col))

            if result_cols:
                # Transpose the column-wise result back into rows.
                header_columns, data_columns = zip(*result_cols)
                table["tableHeaders"] = list(zip(*header_columns))
                table["tableData"] = list(zip(*data_columns))

            yield Table(table)
Ejemplo n.º 6
0
def node_extract_tables(table_node):
    """Extract one or more Table objects from an HTML table node.

    NOTE(review): this function uses several names that are not defined in
    its own scope (``tableId``, ``pgId``, ``pgTitle``, ``aboutURI``,
    ``sectionTitle``, ``tableCaption``, ``surface_pattern``,
    ``surface_links``, ``table``); in particular ``tableId += 1`` requires
    ``tableId`` to be nonlocal/global in an enclosing definition — confirm
    against the surrounding context.
    """
    extractor = Extractor(table_node, transformer=lambda x: x)
    extractor.parse()
    all_htmlrows = [[clean_wikihtml(cell) for cell in row]
                    for row in extractor.return_list()]

    all_htmlrows = hack_annoying_layouts(all_htmlrows)

    # One HTML table can produce several logical tables when it contains
    # sub-header rows; each split advances the (outer-scope) tableId counter.
    for htmlrows in vertically_split_tables_on_subheaders(all_htmlrows):
        tableId += 1

        numCols = max((len(row) for row in htmlrows), default=0)
        # Empty padding cells used to square off ragged rows.
        td = BeautifulSoup("<td></td>", "html.parser")
        th = BeautifulSoup("<th></th>", "html.parser")

        tableHeaders = []
        tableData = []
        for row in htmlrows:
            # Rows made up entirely of <th> cells belong to the header
            # section (and are padded with <th>); all others are data rows.
            h, e = ((tableHeaders, th) if all(c.name == "th" for c in row) else
                    (tableData, td))
            row = [(row[i] if i < len(row) else e) for i in range(numCols)]
            h.append([
                Extractor.get_cell_dict(cell, surface_pattern, surface_links)
                for cell in row
            ])

        # Only yield tables that actually have body rows.
        if tableData:
            numDataRows = len(tableData)
            numHeaderRows = len(tableHeaders)
            log.debug(f"Extracted table {tableId} from {pgTitle}")
            yield Table(
                dict(
                    _id=f"{pgId}#{tableId}",
                    pgId=pgId,
                    pgTitle=pgTitle,
                    tableId=tableId,
                    aboutURI=aboutURI,
                    sectionTitle=sectionTitle,
                    tableCaption=tableCaption,
                    numCols=numCols,
                    numDataRows=numDataRows,
                    numHeaderRows=numHeaderRows,
                    tableData=tableData,
                    tableHeaders=tableHeaders,
                    originalHTML=str(table),
                ))
Ejemplo n.º 7
0
    def convert(docs):
        """Convert WDC-style table documents with first-row headers to Tables."""
        for doc in docs:
            if 'table' in doc:
                # Promote the nested table record, carrying the filename along.
                if 'fname' in doc:
                    doc['table']['fname'] = doc['fname']
                doc = doc['table']

            # Only documents whose first relation row is the header are kept.
            if doc.get("headerPosition") != "FIRST_ROW":
                continue

            header, *body = zip(*doc.pop("relation"))
            if "url" in doc:
                doc['domain'] = urllib.parse.urlparse(doc["url"]).netloc

            # Stable-ish id: prefer the filename, else hash the document.
            if 'fname' in doc:
                _id = doc['fname']
            else:
                _id = "wdc-" + str(abs(hash(str(doc))))

            obj = {
                "_id": _id,
                "tbNr": doc.get("tableNum", 0),
                "pgId": doc.get("url", ""),
                "pgTitle": doc.get("pageTitle", "").strip() or doc.get("url", ""),
                "tableCaption": doc.get("title", "").strip(),
                "tableHeaders": [[{"text": c} for c in header]],
                "tableData": [[{"text": c} for c in row] for row in body],
                "numHeaderRows": 1,
                "numCols": len(header),
                "numDataRows": len(body),
            }
            # Original document fields take precedence over derived defaults.
            obj.update(doc)
            yield Table(obj, linked=False)
Ejemplo n.º 8
0
def try_unpivot(table, pivot, named_heuristics):
    """Unpivot *table* using the heuristic named by *pivot*.

    Returns the unpivoted Table, or None if the unpivot raises (logged at
    debug level).
    """
    try:
        # Record how the table was pivoted in its provenance.
        meta = dict(
            headerId=table.headerId,
            level=pivot.level,
            colfrom=pivot.colfrom,
            colto=pivot.colto,
            heuristic=pivot.source,
            headers=pivot.headers,
        )
        heuristic = named_heuristics[pivot.source]
        head, body = heuristic.unpivot(table.head, table.body, pivot)
        provenance = dict(table.provenance)
        provenance["pivot"] = meta
        return Table(head=head, body=body, provenance=provenance)
    except Exception as e:
        log.debug(f"Cannot pivot table {table.get('_id')} due to {e}")
Ejemplo n.º 9
0
def integrate(tables: List[dict], db: NaryDB, pfd_threshold=0.9):
    """Integrate tables with n-ary relations

    Args:
        tables: Tables to link
        kbdir: KB directory (TODO config)
        pfd_threshold: Probabilistic Functional Dependency Threshold

    NOTE(review): ``pfd_threshold`` is not used in this body — confirm
    whether it should be passed to the profiler or the DB.
    """
    assert isinstance(db, NaryDB)
    with db:

        for table in tables:
            table = Table(table)

            log.debug(
                "Integrating table %s (%d rows)",
                table.get("_id"),
                table["numDataRows"],
            )

            # Find key column
            profiler = PFDProfiler()
            # Columns whose annotated classes include a literal type are
            # excluded from key-column candidates.
            ci_literal = {
                int(ci): any(SimpleTyper().is_literal_type(t) for t in ts)
                for ci, ts in table.annotations.get("classes", {}).items()
            }
            usecols = [ci for ci in range(table["numCols"]) if not ci_literal.get(ci)]
            rows = [[c.get("text") for c in row] for row in table.get("tableData", [])]
            keycol = profiler.get_keycol(rows, usecols)
            table["keycol"] = keycol
            log.debug(f"Got keycol {keycol}")

            # Per-cell candidate entity sets, aligned row-by-row with `rows`.
            ents = table.get("entities", {})
            row_entsets = [
                [
                    set(URIRef(s) for s in ents.get(str(ci), {}).get(str(ri), {}) if s)
                    for ci, _ in enumerate(row)
                ]
                for ri, row in enumerate(rows)
            ]
            tocol_fromcolprop = db.integrate(rows, row_entsets)
            log.debug(f"Got tocol_fromcolprop {tocol_fromcolprop}")
            # Re-index as {fromcol: {tocol: property}} with string keys;
            # setdefault keeps the first property found per column pair.
            properties = {}
            for tocol, fromcolprop in tocol_fromcolprop.items():
                for fromcol, prop in fromcolprop.items():
                    properties.setdefault(str(fromcol), {}).setdefault(str(tocol), prop)
            table.annotations["properties"] = properties

            yield table
Ejemplo n.º 10
0
def unpivot_tables(
    tables: Iterable[Dict],
    headerId_pivot: Optional[Dict[int, Pivot]],
    heuristics: List[PivotFinder],
):
    """Unpivot tables.

    If ``headerId_pivot`` is None, pivots are detected from the tables'
    headers using ``heuristics``; otherwise the given mapping is used.
    """
    materialized = [Table(t) for t in tables]

    if headerId_pivot is None:
        # Detect pivots from the materialized headers.
        heads = [t.head for t in materialized]
        headerId_pivot = dict(yield_pivots(heads, heuristics=heuristics))
    log.debug(f"Using {len(headerId_pivot)} detected pivots")

    by_name = {h.name: h for h in heuristics}
    for table in materialized:
        pivot = headerId_pivot.get(table.headerId)
        if pivot and table.head:
            # try_unpivot may return None on failure; such tables are dropped.
            table = try_unpivot(table, pivot, by_name)

        if table is not None:
            yield table
Ejemplo n.º 11
0
 def get_unannotated_tables(self) -> typing.Sequence[Table]:
     """Yield Tables built from raw records, with gold annotations set aside."""
     for record in self.tables:
         record = dict(record)
         # Wrap raw string cells into the {"text": ...} cell format.
         body = [[{"text": cell} for cell in row]
                 for row in record.pop("rows", [])]
         head = [[{"text": cell} for cell in row]
                 for row in record.pop("headers", [])]
         # Gold-standard annotations are moved under a dedicated key.
         gold = {task: record.pop(task, {})
                 for task in ["entities", "classes", "properties"]}
         obj = {
             "_id": record.pop("name", ""),
             "tableData": body,
             "tableHeaders": head,
             "keycol": record.pop("keycol", None),
             "gold": gold,
         }
         # Any remaining record fields are carried over as-is.
         obj.update(record)
         yield Table(obj=obj)
Ejemplo n.º 12
0
def coltypes(tables: List[dict], typer: Typer):
    """Detect column types

    Args:
        tables: Tables to link
        typer: Typer used to score candidate classes per column
    """
    assert isinstance(typer, Typer)
    with typer:
        for raw in tables:
            table = Table(raw)

            # Score candidate classes for each column from its cell entities.
            ci_classes = table.annotations.setdefault("classes", {})
            for ci, cell_ents in enumerate(get_col_cell_ents(table)):
                # Deduplicate entities per cell before typing.
                deduped = list(dict(cell_ents).items())
                scores = typer.coltype(deduped)
                ci_classes.setdefault(str(ci), {}).update(scores)

            yield table
Ejemplo n.º 13
0
 def get_annotated_tables(self) -> typing.Sequence[Table]:
     # NOTE(review): this wraps a {name: table} dict in a single Table, but
     # the annotation promises a Sequence[Table] — confirm which is intended.
     return Table({table["name"]: table for table in self.tables})
Ejemplo n.º 14
0
def combine_by_first_header(table1, table2):
    """Append *table2* onto *table1* (both wrapped as Tables)."""
    first = Table(table1)
    second = Table(table2)
    return first.append(second)
Ejemplo n.º 15
0
def table_get_headerId(table):
    """Return the header identifier computed for *table*."""
    wrapped = Table(table)
    return wrapped.headerId
Ejemplo n.º 16
0
def link(
    tables: List[dict], linker: Linker,
    usecols: Optional[Union[str, List[int]]] = None,
):
    """Link table entities to KB

    Args:
        tables: Tables to link
        linker: Entity Linker instance
        usecols: Columns to use (table attribute name or list of column indexes)
    """
    assert isinstance(linker, Linker)

    # Translation table stripping punctuation and spaces (loop-invariant,
    # hoisted out of the per-table loop along with the helpers below).
    nopunct = str.maketrans("", "", string.punctuation + " ")

    def isnum(x):
        # A cell counts as numeric if >50% of its remaining chars are digits.
        x = x.translate(nopunct)
        return sum(map(str.isnumeric, x)) / len(x) > 0.5 if x else False

    def numscore(col):
        # Fraction of numeric cells in a column.
        return sum(int(isnum(c)) for c in col) / len(col)

    def uniqscore(col):
        # Fraction of distinct values in a column.
        return len(set(col)) / len(col)

    def heur(col):
        # Heuristic key column: mostly non-numeric and highly unique.
        return (numscore(col) < 0.5) and (uniqscore(col) > 0.9)

    with linker:
        for table in tables:
            table = Table(table)
            rows = table.body

            if not rows:
                log.debug(f"No rows in table {table.get('_id')}")

            # Restrict columns to link (e.g. 'keycol', or 'entcols')
            table["non_numeric_cols"] = [
                i for i, c in enumerate(zip(*rows)) if not numscore(c) > 0.5
            ]

            heuristic_keys = [i for i, c in enumerate(zip(*rows)) if heur(c)]
            table["heuristic_key"] = heuristic_keys[0] if heuristic_keys else []

            # Fix: a list-valued `usecols` is used directly; the original
            # str(usecols) lookup could never match a list and silently fell
            # back to non_numeric_cols, contradicting the docstring.
            if isinstance(usecols, list):
                table_usecols = usecols
            else:
                # Otherwise `usecols` names a table attribute holding columns.
                table_usecols = table.get(str(usecols)) or table["non_numeric_cols"]
            if not isinstance(table_usecols, list):
                table_usecols = [table_usecols]
            if not all(isinstance(c, int) for c in table_usecols):
                log.debug(
                    f"Skipping table {table.get('_id')}, usecols = {table_usecols}"
                )
                continue

            if table_usecols:
                log.debug(
                    f"Linking columns {table_usecols} of table {table.get('_id')}"
                )
            else:
                log.debug(f"Linking table {table.get('_id')}")

            links = linker.link(rows, usecols=table_usecols, existing=table)
            table.annotations.update(links)
            yield table

            # Flush after each table so link results are persisted incrementally.
            linker.flush()
Ejemplo n.º 17
0
def get_header(table1, table2):
    """Return the header of *table1* (*table2* is accepted but unused)."""
    wrapped = Table(table1)
    return wrapped.head