def tables_add_context_rows(tables, fields=()):
    """Prepend context columns derived from table-level fields.

    For each table, every name in ``fields`` becomes a leading column whose
    header text is ``_<field>`` and whose data cells all carry the table's
    value for that field (also recorded as an INTERNAL surface link).
    Column-indexed annotations ("entities", "classes", "properties") are
    shifted right by ``len(fields)`` so they stay aligned with the data.

    Args:
        tables: Iterable of table dicts (or ``Table`` objects).
        fields: Field names to prepend as context columns.

    Yields:
        Table: A new table with the context columns prepended.
    """
    # BUGFIX: materialize `fields` exactly once. The original re-evaluated
    # `list(fields)[::-1]` inside the per-table loop, so a one-shot iterator
    # would be exhausted after the first table, and `len(fields)` below would
    # fail outright on a generator.
    fields = list(fields)
    n = len(fields)
    for table in tables:
        table = Table(table).to_dict()
        # Insert in reverse so the final column order matches `fields`.
        for field in fields[::-1]:
            empty_header = {
                "text": f"_{field}",
                "surfaceLinks": [],
            }
            table["tableHeaders"] = [[empty_header] + list(hrow)
                                     for hrow in table["tableHeaders"]]
            tableHeaders = table["tableHeaders"]
            headerText = tuple(
                tuple([cell.get("text", "").lower() for cell in r])
                for r in tableHeaders)
            # Header changed, so the header fingerprint must be recomputed.
            table["headerId"] = Table.get_headerId(headerText)

            fieldtext = table.get(field, "")
            context_cells = [{
                "text": fieldtext,
                "surfaceLinks": [{
                    "offset": 0,
                    "endOffset": len(fieldtext),
                    "linkType": "INTERNAL",
                    "target": {
                        "href": fieldtext
                    },
                }],
            }]
            # Deep-copy per row so rows never share mutable cell dicts.
            table["tableData"] = [
                copy.deepcopy(context_cells) + list(drow)
                for drow in table["tableData"]
            ]
            table["numCols"] = len(
                table["tableData"][0]) if table["tableData"] else 0

        # Shift column-keyed annotations right by the number of added columns.
        if "entities" in table:
            table["entities"] = {
                str(int(ci) + n): x
                for ci, x in table["entities"].items()
            }
        if "classes" in table:
            table["classes"] = {
                str(int(ci) + n): x
                for ci, x in table["classes"].items()
            }
        if "properties" in table:
            table["properties"] = {
                str(int(fci) + n): {str(int(tci) + n): e
                                    for tci, e in te.items()}
                for fci, te in table["properties"].items()
            }
        yield Table(table)
def yield_pivots(headertexts: Iterable[Collection[Collection[str]]],
                 heuristics: List[PivotFinder]):
    """Detect headers that should be unpivoted using heuristics.

    Args:
        headertexts: Header texts, one collection of header rows per table.
        heuristics: Pivot-finding heuristics; each is used as a context manager.

    Yields:
        ``(headerId, pivot)`` pairs for headers whose pivot survived a dry run.
    """
    import copy
    # Deep-copy the heuristics so the caller's instances stay untouched, and
    # enter each as a context manager; ExitStack closes them all on exit.
    with contextlib.ExitStack() as hstack:
        heuristics = [
            hstack.enter_context(copy.deepcopy(h)) for h in heuristics
        ]
        named_heuristics = {h.name: h for h in heuristics}
        for headertext in headertexts:
            if headertext:
                pivot = find_longest_pivot(headertext, heuristics)
                if pivot is not None:
                    try:
                        # Dry-run the unpivot on a dummy single data row (one
                        # string cell per header column) so broken pivots are
                        # rejected here rather than downstream.
                        dummy = [[
                            str(ci)
                            for ci in range(len(next(iter(headertext))))
                        ]]
                        heuristic = named_heuristics[pivot.source]
                        heuristic.unpivot(headertext, dummy, pivot)
                        yield Table.get_headerId(headertext), pivot
                    except Exception as e:
                        # Best-effort: a failing heuristic only drops this pivot.
                        log.debug(
                            f"Failed to unpivot header {headertext} with {pivot.source} due to {e}"
                        )
def restructure(tables: typing.Iterable[Table],
                prefix_header_rules=(),
                max_cols=100) -> typing.Iterable[Table]:
    """Restructure tables.

    Performs all sorts of heuristic cleaning operations, including:

    - Remove empty columns (:meth:`takco.reshape.clean.remove_empty_columns`)
    - Deduplicate header rows (:meth:`takco.reshape.clean.deduplicate_header_rows`)
    - Remove empty header rows (:meth:`takco.reshape.clean.remove_empty_header_rows`)
    - Process rowspanning head cells (:meth:`takco.reshape.clean.process_rowspanning_head_cells`)
    - Restack horizontal schema repeats (:meth:`takco.reshape.clean.restack_horizontal_schema_repeats`)
    - Remove empty rows (:meth:`takco.reshape.clean.remove_empty_rows`)
    - Process rowspanning body cells (:meth:`takco.reshape.clean.process_rowspanning_body_cells`)

    Args:
        tables: Tables to clean.
        prefix_header_rules: Rules for :func:`apply_prefix_header_rules`.
        max_cols: Tables with at least this many columns are skipped.

    Yields:
        Table: Cleaned tables with non-empty data.
    """
    for table in tables:
        try:
            table = Table(table).to_dict()

            # Skip pathologically wide tables.
            if table.get("numCols", 0) >= max_cols:
                continue

            # BUGFIX: fetch headers with a default. The original used
            # `table.get("tableHeaders")` (no default) inside `any(...)`, so a
            # table missing that key raised TypeError, which the broad except
            # below silently turned into dropping the table entirely.
            hs = table.get("tableHeaders", [])
            if any("tdHtmlString" in c for r in hs for c in r):
                # If every "header" cell is actually a <td>, the header rows
                # were mis-detected: demote them all to data.
                if all(
                        c.get("tdHtmlString", "")[:3] == "<td" for r in hs
                        for c in r):
                    table["tableData"] = hs + table.get("tableData", [])
                    table["tableHeaders"] = []

            init_captions(table)

            # Analyze headers & data together
            deduplicate_header_rows(table)

            # Analyze header
            remove_empty_header_rows(table)
            process_rowspanning_head_cells(table)
            restack_horizontal_schema_repeats(table)
            table["tableHeaders"] = [h for h in table["tableHeaders"] if h]

            # Analyze body
            remove_empty_rows(table)
            process_rowspanning_body_cells(table)
            heuristic_transpose(table)
            remove_empty_columns(table)

            apply_prefix_header_rules(table, prefix_header_rules)

            if table["tableData"]:
                yield Table(table)
        except Exception as e:
            # Best-effort pipeline: log and move on to the next table.
            log.error(e)
def lookup_hyperlinks(tables: List[dict], lookup: Lookup, lookup_cells=False):
    """Lookup the (Wikipedia) hyperlinks inside cells for entity links

    Args:
        tables: Tables to link
        lookup: :mod:`takco.link.base.Lookup` object used to resolve hrefs
        lookup_cells: Whether to also look up plain cell text
    """
    assert isinstance(lookup, Lookup)
    with lookup as look:
        for t in tables:
            table = Table(t)
            log.debug(f"Looking up hyperlinks of {table.get('_id')} using {look}")
            cell_hrefs = get_hrefs(table.get("tableData", []),
                                   lookup_cells=lookup_cells)
            # Merge resolved entities into the table's annotations in place.
            entities = table.annotations.setdefault("entities", {})
            for ci, row_ents in look.lookup_cells(cell_hrefs).items():
                col_entities = entities.setdefault(ci, {})
                for ri, found in row_ents.items():
                    col_entities.setdefault(ri, {}).update(found)
            yield table
        look.flush()
def split_compound_columns(tables, splitter):
    """Detect and split compound columns."""
    log.info(f"Splitting compound columns using {splitter}")
    with splitter:
        for raw in tables:
            table = Table(raw).to_dict()
            # Work column-wise: transpose headers and data into columns.
            header_columns = list(zip(*table.get("tableHeaders", [])))
            data_columns = list(zip(*table.get("tableData", [])))
            rebuilt = []
            for ci, (hcol, dcol) in enumerate(zip(header_columns, data_columns)):
                splits = list(splitter.find_splits(dcol))
                if not splits:
                    rebuilt.append((hcol, dcol))
                    continue
                log.debug(
                    f"Found {len(splits)} splits in column {ci} of {table.get('_id')}: {list(zip(*splits))[:2]}"
                )
                for part, _, newcol in splits:
                    newhcol = list(hcol)
                    if newhcol:
                        # Copy the bottom header cell and suffix the part name.
                        bottom = dict(newhcol[-1])
                        bottom["text"] = bottom.get("text", "") + " " + (part or "")
                        newhcol[-1] = bottom
                    rebuilt.append((newhcol, newcol))
            if rebuilt:
                # Transpose the columns back into rows.
                header_columns, data_columns = zip(*rebuilt)
                table["tableHeaders"] = list(zip(*header_columns))
                table["tableData"] = list(zip(*data_columns))
            yield Table(table)
def node_extract_tables(table_node):
    # NOTE(review): this function references several names it never defines
    # (tableId, pgId, pgTitle, aboutURI, sectionTitle, tableCaption,
    # surface_pattern, surface_links, table). It was presumably extracted from
    # a closure where those are bound; as a standalone function `tableId += 1`
    # would raise UnboundLocalError. TODO: confirm against the caller.
    extractor = Extractor(table_node, transformer=lambda x: x)
    extractor.parse()
    # Clean each cell's HTML, then repair known-awkward layouts.
    all_htmlrows = [[clean_wikihtml(cell) for cell in row]
                    for row in extractor.return_list()]
    all_htmlrows = hack_annoying_layouts(all_htmlrows)
    # One HTML <table> may contain several logical tables separated by
    # sub-header rows; yield one Table per vertical segment.
    for htmlrows in vertically_split_tables_on_subheaders(all_htmlrows):
        tableId += 1
        numCols = max((len(row) for row in htmlrows), default=0)
        # Empty filler cells used to pad short rows up to numCols.
        td = BeautifulSoup("<td></td>", "html.parser")
        th = BeautifulSoup("<th></th>", "html.parser")
        tableHeaders = []
        tableData = []
        for row in htmlrows:
            # A row counts as a header row only when every cell is a <th>;
            # pick the matching bucket and filler element.
            h, e = ((tableHeaders, th) if all(c.name == "th" for c in row)
                    else (tableData, td))
            row = [(row[i] if i < len(row) else e) for i in range(numCols)]
            h.append([
                Extractor.get_cell_dict(cell, surface_pattern, surface_links)
                for cell in row
            ])
        # Header-only segments are dropped: only yield when data rows exist.
        if tableData:
            numDataRows = len(tableData)
            numHeaderRows = len(tableHeaders)
            log.debug(f"Extracted table {tableId} from {pgTitle}")
            yield Table(
                dict(
                    _id=f"{pgId}#{tableId}",
                    pgId=pgId,
                    pgTitle=pgTitle,
                    tableId=tableId,
                    aboutURI=aboutURI,
                    sectionTitle=sectionTitle,
                    tableCaption=tableCaption,
                    numCols=numCols,
                    numDataRows=numDataRows,
                    numHeaderRows=numHeaderRows,
                    tableData=tableData,
                    tableHeaders=tableHeaders,
                    originalHTML=str(table),
                ))
def convert(docs):
    """Convert WDC-style table documents into :class:`Table` objects.

    Only documents whose ``headerPosition`` is ``"FIRST_ROW"`` are converted;
    all others are skipped.

    Args:
        docs: Iterable of WDC table dicts, possibly wrapped as ``{"table": ...}``.

    Yields:
        Table: Converted tables with a single header row.
    """
    for doc in docs:
        # Unwrap documents nested under a "table" key, carrying the filename.
        if 'table' in doc:
            if 'fname' in doc:
                doc['table']['fname'] = doc['fname']
            doc = doc['table']
        # Guard clause: without a FIRST_ROW header we cannot split the
        # relation into header + body. (Falling through here would reuse
        # `header`/`body` from a previous iteration, or raise NameError.)
        if doc.get("headerPosition") != "FIRST_ROW":
            continue
        # `relation` is column-major; transposing yields rows, the first of
        # which is the header.
        header, *body = zip(*doc.pop("relation"))
        if "url" in doc:
            doc['domain'] = urllib.parse.urlparse(doc["url"]).netloc
        if 'fname' in doc:
            _id = doc['fname']
        else:
            # No filename: derive a stable-ish synthetic id from the content.
            _id = "wdc-" + str(abs(hash(str(doc))))
        yield Table(
            {
                "_id": _id,
                "tbNr": doc.get("tableNum", 0),
                "pgId": doc.get("url", ""),
                "pgTitle": doc.get("pageTitle", "").strip() or doc.get("url", ""),
                "tableCaption": doc.get("title", "").strip(),
                "tableHeaders": [[{
                    "text": c
                } for c in header]],
                "tableData": [[{
                    "text": c
                } for c in row] for row in body],
                "numHeaderRows": 1,
                "numCols": len(header),
                "numDataRows": len(body),
                **doc,
            },
            linked=False)
def try_unpivot(table, pivot, named_heuristics):
    """Unpivot `table` using the heuristic named by `pivot.source`.

    Returns the unpivoted Table (with pivot provenance recorded), or None
    when the heuristic fails.
    """
    try:
        # Record how the pivot was found, for provenance.
        pivot_record = {
            "headerId": table.headerId,
            "level": pivot.level,
            "colfrom": pivot.colfrom,
            "colto": pivot.colto,
            "heuristic": pivot.source,
            "headers": pivot.headers,
        }
        head, body = named_heuristics[pivot.source].unpivot(
            table.head, table.body, pivot)
        provenance = dict(table.provenance)
        provenance["pivot"] = pivot_record
        return Table(head=head, body=body, provenance=provenance)
    except Exception as e:
        log.debug(f"Cannot pivot table {table.get('_id')} due to {e}")
def integrate(tables: List[dict], db: NaryDB, pfd_threshold=0.9):
    """Integrate tables with n-ary relations

    Args:
        tables: Tables to link
        kbdir: KB directory (TODO config)
        pfd_threshold: Probabilistic Functional Dependency Threshold
    """
    # NOTE(review): `pfd_threshold` is accepted but never used below — confirm
    # whether PFDProfiler should receive it.
    assert isinstance(db, NaryDB)
    with db:
        for table in tables:
            table = Table(table)
            log.debug(
                "Integrating table %s (%d rows)",
                table.get("_id"),
                table["numDataRows"],
            )

            # Find key column
            profiler = PFDProfiler()
            # Columns whose annotated classes include a literal type are
            # excluded from key-column candidates.
            ci_literal = {
                int(ci): any(SimpleTyper().is_literal_type(t) for t in ts)
                for ci, ts in table.annotations.get("classes", {}).items()
            }
            usecols = [ci for ci in range(table["numCols"]) if not ci_literal.get(ci)]
            rows = [[c.get("text") for c in row] for row in table.get("tableData", [])]
            keycol = profiler.get_keycol(rows, usecols)
            table["keycol"] = keycol
            log.debug(f"Got keycol {keycol}")

            # Already-linked entity URIs per cell, aligned row-by-row with `rows`.
            ents = table.get("entities", {})
            row_entsets = [
                [
                    set(URIRef(s) for s in ents.get(str(ci), {}).get(str(ri), {})
                        if s)
                    for ci, _ in enumerate(row)
                ]
                for ri, row in enumerate(rows)
            ]
            tocol_fromcolprop = db.integrate(rows, row_entsets)
            log.debug(f"Got tocol_fromcolprop {tocol_fromcolprop}")
            # Re-key as {from-column: {to-column: property}} with string keys,
            # matching the annotation format used elsewhere.
            properties = {}
            for tocol, fromcolprop in tocol_fromcolprop.items():
                for fromcol, prop in fromcolprop.items():
                    properties.setdefault(str(fromcol), {}).setdefault(str(tocol), prop)
            table.annotations["properties"] = properties

            yield table
def unpivot_tables(
    tables: Iterable[Dict],
    headerId_pivot: Optional[Dict[int, Pivot]],
    heuristics: List[PivotFinder],
):
    """Unpivot tables, detecting pivots from their headers when none are given."""
    tablelist = [Table(t) for t in tables]
    if headerId_pivot is None:
        # No precomputed pivots: detect them from the tables' own headers.
        heads = [t.head for t in tablelist]
        headerId_pivot = dict(yield_pivots(heads, heuristics=heuristics))
        log.debug(f"Using {len(headerId_pivot)} detected pivots")

    by_name = {h.name: h for h in heuristics}
    for table in tablelist:
        pivot = headerId_pivot.get(table.headerId)
        if pivot and table.head:
            table = try_unpivot(table, pivot, by_name)
        # try_unpivot returns None on failure; drop such tables.
        if table is not None:
            yield table
def get_unannotated_tables(self) -> typing.Sequence[Table]:
    """Yield Tables built from raw rows/headers, stashing any gold-standard
    annotations under a separate "gold" key."""
    for raw in self.tables:
        raw = dict(raw)
        body = [[{"text": c} for c in row] for row in raw.pop("rows", [])]
        head = [[{"text": c} for c in row] for row in raw.pop("headers", [])]
        # Move gold annotations out of the table dict itself.
        gold = {
            task: raw.pop(task, {})
            for task in ["entities", "classes", "properties"]
        }
        yield Table(
            obj={
                "_id": raw.pop("name", ""),
                "tableData": body,
                "tableHeaders": head,
                "keycol": raw.pop("keycol", None),
                "gold": gold,
                **raw,
            })
def coltypes(tables: List[dict], typer: Typer):
    """Detect column types

    Args:
        tables: Tables to link
        typer: Typer used to score candidate classes per column
    """
    assert isinstance(typer, Typer)
    with typer:
        for t in tables:
            table = Table(t)
            # Score candidate classes for each column from its cell entities,
            # merging the scores into the table's annotations.
            ci_classes = table.annotations.setdefault("classes", {})
            for ci, cell_ents in enumerate(get_col_cell_ents(table)):
                scored = typer.coltype(list(dict(cell_ents).items()))
                ci_classes.setdefault(str(ci), {}).update(scored)
            yield table
def get_annotated_tables(self) -> typing.Sequence[Table]:
    # NOTE(review): this wraps a single {name: table} mapping in one Table,
    # which does not obviously match the declared Sequence[Table] return type
    # — confirm whether Table accepts a mapping of tables here, or whether
    # this should yield one Table per entry like get_unannotated_tables.
    return Table({table["name"]: table for table in self.tables})
def combine_by_first_header(table1, table2):
    """Coerce both arguments to Table and append the second onto the first."""
    first = Table(table1)
    second = Table(table2)
    return first.append(second)
def table_get_headerId(table):
    """Return the headerId of `table` after coercing it to a Table."""
    coerced = Table(table)
    return coerced.headerId
def link(
    tables: List[dict],
    linker: Linker,
    usecols: Union[str, List[int], None] = None,
):
    """Link table entities to KB

    Args:
        tables: Tables to link
        linker_config: Entity Linker config
        usecols: Columns to use (table attribute name or list of column indexes)
    """
    assert isinstance(linker, Linker)
    with linker:
        for table in tables:
            table = Table(table)
            rows = table.body
            if not rows:
                # NOTE(review): no `continue` here — an empty table still
                # falls through to column heuristics and linking; confirm
                # whether it should be skipped instead.
                log.debug(f"No rows in table {table.get('_id')}")

            # Restrict columns to link (e.g. 'keycol', or 'entcols')
            nopunct = str.maketrans("", "", string.punctuation + " ")

            def isnum(x):
                # A cell counts as numeric when >50% of its non-punctuation,
                # non-space characters are digits.
                x = x.translate(nopunct)
                return sum(map(str.isnumeric, x)) / len(x) > 0.5 if x else False

            def numscore(col):
                # Fraction of numeric cells in the column.
                return sum(int(isnum(c)) for c in col) / len(col)

            def uniqscore(col):
                # Fraction of distinct values in the column.
                return len(set(col)) / len(col)

            table["non_numeric_cols"] = [
                i for i, c in enumerate(zip(*rows)) if not numscore(c) > 0.5
            ]

            def heur(col):
                # Heuristic key column: mostly non-numeric AND mostly unique.
                return (numscore(col) < 0.5) and (uniqscore(col) > 0.9)

            heuristic_keys = [i for i, c in enumerate(zip(*rows)) if heur(c)]
            # NOTE(review): value is an int when a key is found but [] when
            # not — downstream consumers must handle both types.
            table["heuristic_key"] = heuristic_keys[0] if heuristic_keys else []

            # `usecols` may name a table attribute holding column indexes
            # (str(usecols) also covers usecols=None → key 'None', which is
            # absent, so the non-numeric columns are the fallback).
            table_usecols = table.get(str(usecols)) or table["non_numeric_cols"]
            if type(table_usecols) != list:
                table_usecols = [table_usecols]
            if not all(type(c) == int for c in table_usecols):
                log.debug(
                    f"Skipping table {table.get('_id')}, usecols = {table_usecols}"
                )
                continue

            if table_usecols:
                log.debug(
                    f"Linking columns {table_usecols} of table {table.get('_id')}"
                )
            else:
                log.debug(f"Linking table {table.get('_id')}")

            links = linker.link(rows, usecols=table_usecols, existing=table)
            table.annotations.update(links)

            yield table
        linker.flush()
def get_header(table1, table2):
    """Return the header of `table1`.

    `table2` is accepted but not used.
    """
    first = Table(table1)
    return first.head