def getColumnHeaders(tableMatrix):
    """Return decorated column headers for *tableMatrix*.

    Each header is lower-cased, stripped, space-joined with underscores and
    suffixed with "@<column type>"; an empty header cell becomes
    "spancol@<column type>".  When every detected header is empty the table
    is considered header-less and an empty list is returned.
    """
    headerRows, headers = readHTML.getMainColHeaders(tableMatrix)
    headerRows += 1  # data rows start one past the detected header rows

    # All-blank header row => no usable headers at all.
    if len(set(headers)) == 1 and headers[0] == "":
        return []

    decorated = []
    for idx, label in enumerate(headers):
        colType = str(readHTML.getColumnType(idx, headerRows, tableMatrix))
        if label == "":
            decorated.append("spancol@" + colType)
        else:
            decorated.append(label.lower().strip().replace(" ", "_") + "@" + colType)
    return decorated
def updateJsonFile(fileName):
    """Re-extract typed column headers for every table of a JSON article file.

    Loads the article serialized in *fileName*, recomputes the multi-level
    column headers of each WELL_FORMED table (other tables are passed through
    with empty headers), and writes the updated article to
    ``FOLDER_OUT/<name>.json``.

    Returns the result of ``extractLinks2`` for the last processed table
    (empty string when none was processed), or ``None`` when *fileName* is
    not a .json file or when processing fails (the error is logged).
    """
    # Paths in this module are built with "/" — keep the explicit split.
    base = fileName.split("/")[-1]
    file, file_extension = os.path.splitext(base)
    if "json" not in file_extension:
        return
    # FIX: the original leaked the input file handle; use a context manager.
    with open(fileName, "r") as jsonFile:
        raw = jsonFile.read()
    try:
        obj = json.loads(raw)
        article = ComplexDecoder().default(obj)
        tables2D = []
        out = ""
        for table in article.tables:
            # Tables without a usable type carry no headers.
            if table.tableType is None or table.tableType.value == "":
                table.setColHeaders([])
                table.setStartRows(0)
                tables2D.append(table)
                continue
            # Non-well-formed tables are passed through unchanged.
            if table.tableType.value != TableType.WELL_FORMED.value:
                table.setTableType(table.tableType.value)
                table.setColHeaders([])
                table.setStartRows(0)
                tables2D.append(table)
                continue
            try:
                startRow, _ = readHTML.getMainColHeaders(table.htmlMatrix)
            except Exception:
                # Best-effort: a table whose headers cannot be parsed is
                # kept, but without headers.
                table.setTableType(table.tableType.value)
                table.setColHeaders([])
                table.setStartRows(0)
                tables2D.append(table)
                continue
            if startRow == 0:
                table.setTableType(table.tableType.value)
                table.setColHeaders([])
                table.setStartRows(startRow)
                tables2D.append(table)
                continue
            table.setStartRows(startRow)
            matrix = np.array(table.htmlMatrix)
            # Clean each header row: strip HTML tags plus the reserved
            # "*" and "@" separator characters.
            headersMatch = []
            for rowCells in matrix[:startRow]:
                cleanTagHeaders = []
                for rawCell in rowCells:
                    cell = readHTML.cleanTableCellTag(
                        BeautifulSoup(rawCell, "html.parser"))
                    text = " ".join(
                        [s for s in cell.strings if s.strip('\n ') != ''])
                    cleanTagHeaders.append(
                        text.replace("*", "").replace("@", ""))
                headersMatch.append(
                    [textProcessing.cleanCellHeader(h) for h in cleanTagHeaders])
            # Empty cells on the deepest header level become "spancol".
            headersMatch[-1] = [
                'spancol' if h == '' else h for h in headersMatch[-1]
            ]
            # Join the header levels of each column with "**" (rows of a
            # 2-D np.array all have the same length, so zip is safe here).
            newHeader = ["**".join(levels) for levels in zip(*headersMatch)]
            newHeader = [re.sub(r'^\**', '', h) for h in newHeader]
            if startRow > 1:
                newHeader = [
                    h[:-2] if h.endswith("**") else h for h in newHeader
                ]
            newHeader = textProcessing.orderHeaders(newHeader)
            # Suffix every header with "@<column type>".
            newHeaderType = []
            for i, h in enumerate(newHeader):
                # FIX: renamed local (the original shadowed builtin `type`).
                colType = readHTML.getColumnType(i, startRow, table.htmlMatrix)
                newHeaderType.append(h + "@" + str(colType))
            table.setColHeaders(newHeaderType)
            table.ncols = len(newHeaderType)
            table.setTableType(table.tableType.value)
            tables2D.append(table)
            try:
                out = extractLinks2(article.title, table)
            except Exception:
                print("Error links extraction: ", table.tableId)
                traceback.print_exc()
        article.setTables(tables2D)
        # FIX: context manager for the output file as well.
        with open(FOLDER_OUT + "/" + file + ".json", "w") as f:
            f.write(json.dumps(article.reprJSON(), cls=ComplexEncoder,
                               skipkeys=True))
        return out
    except Exception:
        # Top-level best-effort boundary: log and return None, matching
        # the original behavior.
        print("Error File: ", file)
        traceback.print_exc()