def readFile(path):
    """Read a bz2-compressed HTML file and write its tables as an article.

    The file is decompressed and parsed with BeautifulSoup, the title and
    tables are extracted through ``readHTML``, every table is converted with
    ``readHTML.tableTo2d`` and the result is handed to ``writeArticle`` as an
    ``Article`` with the fixed id ``1``.

    :param path: path of the bz2-compressed HTML file.
    """
    # The context manager closes the compressed file even if parsing fails.
    with bz2.BZ2File(path) as bz_file:
        soup = BeautifulSoup(bz_file.read(), 'html.parser')
    title = readHTML.readTitle(soup)
    tables2d = []
    for t in readHTML.readTables(soup):
        # tableTo2d is unpacked as a pair here; only the second item is kept.
        _, t2d = readHTML.tableTo2d(t)
        tables2d.append(t2d)
    writeArticle(Article(1, title, tables2d))
def extractTables(filename, folderOut, cont, dictCount):
    """Extract tables from html.bz, and generate a new file with only tables.

    Files whose extension does not contain ``bz2`` are ignored. Every table
    found is classified with ``tableValidator.validateHTMLTable`` and counted
    in ``dictCount``; tables of type WIKITABLE, NO_CSS_CLASS or
    WITH_INNER_TABLE are copied to the output document. The output file is
    only written when at least one closing table tag ends up in the document.
    Any error is logged and its traceback printed; it is never propagated.

    :param filename: filename bz file.
    :param folderOut: folder where generated files will be saved.
    :param cont: number of file, it will be article ID.
    :param dictCount: save stats of table types (updated in place).
    :return: always ``None``.
    """
    baseName = filename.split("/")[-1]
    try:
        file, file_extension = os.path.splitext(baseName)
        if "bz2" not in file_extension:
            return
        print("[Worker %d] File numer %d" % (os.getpid(), cont))
        # Close the compressed input as soon as it has been parsed.
        with bz2.BZ2File(filename, "rb") as bzFile:
            soup = BeautifulSoup(bzFile.read(), 'html.parser')
        title = readHTML.readTitle(soup)
        tables = readHTML.readTables(soup)
        usefulTypes = (
            TableType.WIKITABLE.value,
            TableType.NO_CSS_CLASS.value,
            TableType.WITH_INNER_TABLE.value,
        )
        # Collect the fragments and join once instead of repeated "+=".
        parts = [
            "<html><head></head><body><h1 class='firstHeading'>{}</h1>".format(
                title)
        ]
        for t in tables:
            tableType = tableValidator.validateHTMLTable(t)
            dictCount[tableType.value] += 1
            logging.debug('dictCount: %s', dictCount)
            if tableType.value in usefulTypes:
                parts.append(str(t) + "<br/>")
                dictCount[TableType.USEFULL_TABLE.value] += 1
        html = "".join(parts)
        if "</table>" in html or "</TABLE>" in html:
            separator = "" if folderOut.endswith("/") else "/"
            outPath = folderOut + separator + file + ".bz2"
            # "with" guarantees the output is flushed and closed even when
            # the write fails half way.
            with bz2.open(outPath, "wt") as newFile:
                newFile.write(html + "</body></html>")
    except Exception:
        # Best effort: one broken file must not stop the worker, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        logging.debug('Error: %s', filename)
        traceback.print_exc()
def normalizeTables(filename):
    """Convert the tables of one bz2 HTML file into a JSON article file.

    ``filename`` carries two values joined by ``##$##``: the path of the
    bz2-compressed HTML file and the integer counter used as article id.
    Each table that is not nested inside another table cell/row is converted
    with ``readHTML.tableTo2d`` and classified by its ``tableType``. Tables
    that are kept receive the id ``<cont>.<n>`` and, if there is at least one,
    the article is written to ``FOLDER_OUT/<cont>.json``.

    :param filename: string of the form ``<path>##$##<cont>``.
    :return: one tab-separated line terminated by a newline holding ``cont``
        followed by the ILL_FORMED, NO_PROCESSED, WELL_FORMED, SMALLTABLE,
        WITH_INNER_TABLE and FORMAT_BOX counters. When the file cannot be
        read all six counters are ``0``.
    :raises ValueError: if the counter part of ``filename`` is not an integer.
    """
    pieces = filename.split("##$##")
    bzPath = pieces[0]
    cont = int(pieces[1])
    print("cont: ", cont)
    try:
        with bz2.BZ2File(bzPath, "rb") as bzFile:
            soup = BeautifulSoup(bzFile.read(), 'html.parser')
    except Exception:
        print("Error reading file: ", filename)
        # Same seven-column layout as the regular result: the id, a tab and
        # six zero counters (the id used to be fused with the first zero).
        return "\t".join([str(cont)] + ["0"] * 6) + "\n"

    illFormed = TableType.ILL_FORMED.value
    wellFormed = TableType.WELL_FORMED.value
    smallTable = TableType.SMALLTABLE.value
    withInner = TableType.WITH_INNER_TABLE.value
    formatBox = TableType.FORMAT_BOX.value
    # Column order of the returned stats line.
    statKeys = [illFormed, "NO_PROCESSED", wellFormed, smallTable,
                withInner, formatBox]
    dictStat = dict.fromkeys(statKeys, 0)
    try:
        title = readHTML.readTitle(soup)
        tables = readHTML.readTables(soup)
        tables2d = []
        contTables = 1
        formatTables = 0
        for it, t in enumerate(tables):
            try:
                # Skip tables that live inside a cell or row of another table.
                parents = [p.name for p in t.findParents()]
                if t.parent is not None and (
                        "th" in parents or "td" in parents or "tr" in parents):
                    continue
                start = time.time()
                listt2d = readHTML.tableTo2d(t)
                logging.debug("Time reading table: %s", time.time() - start)
                validTables = []
                if listt2d is None or len(listt2d) == 0:
                    newTable = readHTML.saveIllTable(t, illFormed)
                    if newTable is not None:
                        validTables.append(newTable)
                        dictStat[illFormed] += 1
                    else:
                        dictStat["NO_PROCESSED"] += 1
                elif len(listt2d) > 10:
                    # More than 10 resulting tables: counted as ill formed
                    # and skipped.
                    # NOTE(review): the previous code appended an unbound
                    # `newTable` here before skipping; confirm whether such
                    # tables should instead be stored via saveIllTable.
                    dictStat[illFormed] += 1
                    continue
                else:
                    for t2d in listt2d:
                        kind = t2d.tableType
                        if kind == formatBox:
                            dictStat[formatBox] += 1
                            formatTables += 1
                        elif kind == smallTable:
                            dictStat[smallTable] += 1
                        elif kind == illFormed:
                            dictStat[illFormed] += 1
                            validTables.append(t2d)
                        elif kind == withInner:
                            dictStat[withInner] += 1
                            validTables.append(t2d)
                        else:
                            validTables.append(t2d)
                            dictStat[wellFormed] += 1
                for t2d in validTables:
                    t2d.setTableId(str(cont) + "." + str(contTables))
                    tables2d.append(t2d)
                    contTables += 1
            except Exception:
                # A single broken table must not abort the whole article.
                traceback.print_exc()
                print("Error: ", filename, it)
                continue
        if len(tables2d) > 0:
            article = Article(articleId=str(cont),
                              title=title,
                              tables=tables2d)
            # Serialize first so a failure does not leave an empty file.
            payload = json.dumps(article.reprJSON(),
                                 cls=ComplexEncoder,
                                 skipkeys=True)
            with open(FOLDER_OUT + "/" + str(cont) + ".json", "w") as f:
                f.write(payload)
        elif len(tables) == formatTables:
            logging.debug("Format table: " + filename)
        else:
            logging.debug("Error none useful table: " + filename)
        logging.debug(dictStat)
    except Exception:
        traceback.print_exc()
        # A placeholder is required, otherwise logging reports a format error.
        logging.debug("Error file %s", filename)
    return "\t".join([str(cont)] + [str(dictStat[k]) for k in statKeys]) + "\n"