def restructure(
    tables: typing.Iterable[Table], prefix_header_rules=(), max_cols=100
) -> typing.Iterable[Table]:
    """Restructure tables.

    Performs all sorts of heuristic cleaning operations, including:

    - Remove empty columns (:meth:`takco.reshape.clean.remove_empty_columns`)
    - Deduplicate header rows (:meth:`takco.reshape.clean.deduplicate_header_rows`)
    - Remove empty header rows (:meth:`takco.reshape.clean.remove_empty_header_rows`)
    - Process rowspanning head cells (:meth:`takco.reshape.clean.process_rowspanning_head_cells`)
    - Restack horizontal schema repeats (:meth:`takco.reshape.clean.restack_horizontal_schema_repeats`)
    - Remove empty rows (:meth:`takco.reshape.clean.remove_empty_rows`)
    - Process rowspanning body cells (:meth:`takco.reshape.clean.process_rowspanning_body_cells`)

    Args:
        tables: Tables to clean.
        prefix_header_rules: Rules passed to :func:`apply_prefix_header_rules`.
        max_cols: Tables with at least this many columns are skipped.

    Yields:
        Cleaned tables; tables that end up with an empty body are dropped.
    """
    for table in tables:
        try:
            table = Table(table).to_dict()
            # Skip unusually wide tables (typically layout artifacts)
            if table.get("numCols", 0) >= max_cols:
                continue

            # If every "header" cell is actually a <td> (not <th>), the header
            # rows are really data rows: move them into the body.
            # BUG FIX: use a default of [] so a missing "tableHeaders" key does
            # not raise TypeError (which the broad except below would swallow,
            # silently dropping the table).
            if any(
                "tdHtmlString" in c
                for r in table.get("tableHeaders", [])
                for c in r
            ):
                hs = table.get("tableHeaders", [])
                if all(c.get("tdHtmlString", "")[:3] == "<td" for r in hs for c in r):
                    table["tableData"] = hs + table.get("tableData", [])
                    table["tableHeaders"] = []

            init_captions(table)

            # Analyze headers & data together
            deduplicate_header_rows(table)

            # Analyze header
            remove_empty_header_rows(table)
            process_rowspanning_head_cells(table)
            restack_horizontal_schema_repeats(table)
            table["tableHeaders"] = [h for h in table["tableHeaders"] if h]

            # Analyze body
            remove_empty_rows(table)
            process_rowspanning_body_cells(table)
            heuristic_transpose(table)
            remove_empty_columns(table)

            apply_prefix_header_rules(table, prefix_header_rules)

            if table["tableData"]:
                yield Table(table)
        except Exception as e:
            # Cleaning is best-effort per table; log WITH traceback so
            # failures are diagnosable (log.error(e) loses the stack).
            log.exception(e)
def integrate(tables: List[dict], db: NaryDB, pfd_threshold=0.9):
    """Integrate tables with n-ary relations

    Args:
        tables: Tables to link
        db: N-ary relation database to integrate against
        pfd_threshold: Probabilistic Functional Dependency threshold
            (NOTE(review): currently unused in this function; kept for
            backward-compatible interface)

    Yields:
        Tables annotated with ``keycol`` and ``properties``.
    """
    assert isinstance(db, NaryDB)
    # Hoisted: one typer instance instead of one per class entry per table
    typer = SimpleTyper()
    with db:
        for table in tables:
            table = Table(table)
            log.debug(
                "Integrating table %s (%d rows)",
                table.get("_id"),
                table["numDataRows"],
            )

            # Find key column: only non-literal (entity-like) columns qualify
            profiler = PFDProfiler()
            ci_literal = {
                int(ci): any(typer.is_literal_type(t) for t in ts)
                for ci, ts in table.annotations.get("classes", {}).items()
            }
            usecols = [ci for ci in range(table["numCols"]) if not ci_literal.get(ci)]
            rows = [[c.get("text") for c in row] for row in table.get("tableData", [])]
            keycol = profiler.get_keycol(rows, usecols)
            table["keycol"] = keycol
            log.debug(f"Got keycol {keycol}")

            # Per-row, per-column sets of entity URIs already linked to cells
            ents = table.get("entities", {})
            row_entsets = [
                [
                    set(
                        URIRef(s)
                        for s in ents.get(str(ci), {}).get(str(ri), {})
                        if s
                    )
                    for ci, _ in enumerate(row)
                ]
                for ri, row in enumerate(rows)
            ]
            tocol_fromcolprop = db.integrate(rows, row_entsets)
            log.debug(f"Got tocol_fromcolprop {tocol_fromcolprop}")

            # Re-key as {fromcol: {tocol: prop}} with string column indices
            properties = {}
            for tocol, fromcolprop in tocol_fromcolprop.items():
                for fromcol, prop in fromcolprop.items():
                    properties.setdefault(str(fromcol), {}).setdefault(str(tocol), prop)
            table.annotations["properties"] = properties
            yield table
def tables_add_context_rows(tables, fields=()):
    """Add context to table depending on table dict fields

    For every name in ``fields`` (applied right-to-left), prepends a header
    column ``_<field>`` and a body column whose cells contain the table's
    value for that field, self-linked as an INTERNAL hyperlink. Existing
    column-indexed annotations ("entities", "classes", "properties") are
    shifted right by ``len(fields)`` so they stay aligned with the data.

    Args:
        tables: Table dicts (or :class:`Table` objects) to transform.
        fields: Table dict keys whose values become context columns.

    Yields:
        :class:`Table` objects with the context columns prepended.
    """
    for table in tables:
        table = Table(table).to_dict()
        # Reversed so the FIRST field ends up as the leftmost column
        for field in list(fields)[::-1]:
            empty_header = {
                "text": f"_{field}",
                "surfaceLinks": [],
            }
            table["tableHeaders"] = [[empty_header] + list(hrow)
                                     for hrow in table["tableHeaders"]]
            tableHeaders = table["tableHeaders"]
            headerText = tuple(
                tuple([cell.get("text", "").lower() for cell in r])
                for r in tableHeaders)
            # Header changed, so its fingerprint must be recomputed
            table["headerId"] = Table.get_headerId(headerText)
            fieldtext = table.get(field, "")
            # Each context cell links to itself (its text is the link target)
            context_cells = [{
                "text": fieldtext,
                "surfaceLinks": [{
                    "offset": 0,
                    "endOffset": len(fieldtext),
                    "linkType": "INTERNAL",
                    "target": {
                        "href": fieldtext
                    },
                }],
            }]
            # deepcopy so rows don't share one mutable cell dict
            table["tableData"] = [
                copy.deepcopy(context_cells) + list(drow)
                for drow in table["tableData"]
            ]
            table["numCols"] = len(
                table["tableData"][0]) if table["tableData"] else 0
        # Shift column-indexed annotations right by the number of added columns
        n = len(fields)
        if "entities" in table:
            table["entities"] = {
                str(int(ci) + n): x
                for ci, x in table["entities"].items()
            }
        if "classes" in table:
            table["classes"] = {
                str(int(ci) + n): x
                for ci, x in table["classes"].items()
            }
        if "properties" in table:
            # Both the from-column and to-column indices shift
            table["properties"] = {
                str(int(fci) + n): {str(int(tci) + n): e for tci, e in te.items()}
                for fci, te in table["properties"].items()
            }
        yield Table(table)
def split_compound_columns(tables, splitter):
    """Detect and split compound columns

    Runs ``splitter`` over each body column; whenever splits are found, the
    column is replaced by one column per split part, with the part label
    appended to the text of the bottom header cell.
    """
    log.info(f"Splitting compound columns using {splitter}")
    with splitter:
        for table in tables:
            table = Table(table).to_dict()
            header_columns = list(zip(*table.get("tableHeaders", [])))
            body_columns = list(zip(*table.get("tableData", [])))
            rebuilt = []
            for ci, (hcol, dcol) in enumerate(zip(header_columns, body_columns)):
                splits = list(splitter.find_splits(dcol))
                if not splits:
                    # Nothing to split: keep the column unchanged
                    rebuilt.append((hcol, dcol))
                    continue
                log.debug(
                    f"Found {len(splits)} splits in column {ci} of {table.get('_id')}: {list(zip(*splits))[:2]}"
                )
                for part, _, split_col in splits:
                    split_hcol = list(hcol)
                    if split_hcol:
                        # Copy the bottom header cell, then suffix its text
                        # with the split-part label
                        bottom = dict(split_hcol[-1])
                        bottom["text"] = bottom.get("text", "") + " " + (part or "")
                        split_hcol[-1] = bottom
                    rebuilt.append((split_hcol, split_col))
            if rebuilt:
                # Transpose the per-column lists back to per-row layout
                header_columns, body_columns = zip(*rebuilt)
                table["tableHeaders"] = list(zip(*header_columns))
                table["tableData"] = list(zip(*body_columns))
            yield Table(table)
def lookup_hyperlinks(tables: List[dict], lookup: Lookup, lookup_cells=False):
    """Lookup the (Wikipedia) hyperlinks inside cells for entity links

    Args:
        tables: Tables to link
        lookup: A :mod:`takco.link.base.Lookup` object to resolve hrefs with
        lookup_cells: Passed through to :func:`get_hrefs`
    """
    assert isinstance(lookup, Lookup)
    with lookup as look:
        for table in tables:
            table = Table(table)
            log.debug(f"Looking up hyperlinks of {table.get('_id')} using {look}")
            hrefs = get_hrefs(table.get("tableData", []), lookup_cells=lookup_cells)
            # Merge the lookup results into the {col: {row: {entity: ...}}}
            # annotation structure without clobbering existing entries
            ents = table.annotations.setdefault("entities", {})
            for ci, by_row in look.lookup_cells(hrefs).items():
                col_ents = ents.setdefault(ci, {})
                for ri, found in by_row.items():
                    col_ents.setdefault(ri, {}).update(found)
            yield table
        look.flush()
def link(
    tables: List[dict],
    linker: Linker,
    usecols: Union[str, List[int], None] = None,
):
    """Link table entities to KB

    Args:
        tables: Tables to link
        linker: Entity linker
        usecols: Columns to use: either the name of a table attribute that
            holds column indexes (e.g. ``"keycol"``), or a list of column
            indexes. Falls back to the table's non-numeric columns.

    Yields:
        Tables with the linker's annotations merged in.
    """
    assert isinstance(linker, Linker)

    # Hoisted out of the per-table loop: all of these are loop-invariant.
    # Translation table that strips punctuation and spaces.
    nopunct = str.maketrans("", "", string.punctuation + " ")

    def isnum(x):
        # True if >50% of the non-punctuation characters are numeric
        x = x.translate(nopunct)
        return sum(map(str.isnumeric, x)) / len(x) > 0.5 if x else False

    def numscore(col):
        # Fraction of cells in the column that look numeric
        return sum(int(isnum(c)) for c in col) / len(col)

    def uniqscore(col):
        # Fraction of distinct values in the column
        return len(set(col)) / len(col)

    def heur(col):
        # Heuristic key column: mostly non-numeric AND mostly unique
        return (numscore(col) < 0.5) and (uniqscore(col) > 0.9)

    with linker:
        for table in tables:
            table = Table(table)
            rows = table.body
            if not rows:
                # NOTE(review): an empty table still falls through to
                # linker.link() below; kept as-is to preserve behavior —
                # confirm whether a `continue` was intended here.
                log.debug(f"No rows in table {table.get('_id')}")

            # Restrict columns to link (e.g. 'keycol', or 'entcols')
            table["non_numeric_cols"] = [
                i for i, c in enumerate(zip(*rows)) if not numscore(c) > 0.5
            ]
            heuristic_keys = [i for i, c in enumerate(zip(*rows)) if heur(c)]
            table["heuristic_key"] = heuristic_keys[0] if heuristic_keys else []

            # `usecols` may name a table attribute holding the column indexes
            table_usecols = table.get(str(usecols)) or table["non_numeric_cols"]
            if not isinstance(table_usecols, list):
                table_usecols = [table_usecols]
            if not all(isinstance(c, int) for c in table_usecols):
                log.debug(
                    f"Skipping table {table.get('_id')}, usecols = {table_usecols}"
                )
                continue

            if table_usecols:
                log.debug(
                    f"Linking columns {table_usecols} of table {table.get('_id')}"
                )
            else:
                log.debug(f"Linking table {table.get('_id')}")

            links = linker.link(rows, usecols=table_usecols, existing=table)
            table.annotations.update(links)
            yield table
        linker.flush()