Exemple #1
0
def getColumnHeaders(tableMatrix):
    """Build typed column-header labels for *tableMatrix*.

    The raw headers come from ``readHTML.getMainColHeaders``. Each label has
    the form ``<name>@<type>``, where ``<name>`` is the header lower-cased,
    stripped and with spaces replaced by underscores (or ``spancol`` when
    the header is the empty string) and ``<type>`` is the string form of
    whatever ``readHTML.getColumnType`` returns for that column.

    Returns an empty list when every header is the empty string.
    """
    mainStartRow, rawHeaders = readHTML.getMainColHeaders(tableMatrix)
    # getColumnType is given the row after the one reported above.
    # NOTE(review): updateJsonFile passes the unincremented start row to
    # getColumnType -- confirm the +1 here is intended.
    typeStartRow = mainStartRow + 1

    # Guard clause: nothing to label when all headers are blank.
    if len(set(rawHeaders)) == 1 and rawHeaders[0] == "":
        return []

    typedHeaders = []
    for index, header in enumerate(rawHeaders):
        if header != "":
            label = header.lower().strip().replace(" ", "_")
        else:
            label = "spancol"
        columnType = readHTML.getColumnType(index, typeStartRow, tableMatrix)
        typedHeaders.append(label + "@" + str(columnType))
    return typedHeaders
Exemple #2
0
def _clearTableHeaders(table, startRow=0, resetType=True):
    """Record *table* as having no column headers.

    When *resetType* is true the table type is first re-applied through
    ``setTableType(table.tableType.value)``; then the column headers are set
    to an empty list and the start row to *startRow*.
    """
    if resetType:
        table.setTableType(table.tableType.value)
    table.setColHeaders([])
    table.setStartRows(startRow)


def _cleanHeaderRows(htmlMatrix, startRow):
    """Return the cleaned cell texts of the first *startRow* rows.

    Each cell is parsed with BeautifulSoup, passed through
    ``readHTML.cleanTableCellTag``, reduced to its non-blank strings joined
    by a space, and stripped of ``*`` and ``@`` characters (those two are
    used as separators in the final header labels).
    """
    matrix = np.array(htmlMatrix)
    # Index every header row up front so an out-of-range startRow fails
    # before any cell is parsed.
    levelRows = [matrix[i] for i in range(startRow)]

    cleanedRows = []
    for row in levelRows:
        cleaned = []
        for rawCell in row:
            cell = BeautifulSoup(rawCell, "html.parser")
            cell = readHTML.cleanTableCellTag(cell)
            text = " ".join(s for s in cell.strings if s.strip('\n ') != '')
            text = text.replace("*", "").replace("@", "")
            cleaned.append(text)
            # NOTE(review): this re-cleans every header collected so far on
            # each column, so earlier cells are cleaned repeatedly. Kept
            # as-is; hoisting it out of the loop is only safe if
            # cleanCellHeader is idempotent -- confirm.
            cleaned = [textProcessing.cleanCellHeader(h) for h in cleaned]
        cleanedRows.append(cleaned)
    return cleanedRows


def _buildTypedHeaders(table, startRow):
    """Return ``<header>@<type>`` labels built from the table's header rows.

    Multi-row headers are merged per column with ``**`` as separator, blank
    cells in the last header row become ``spancol``, and the merged headers
    are passed through ``textProcessing.orderHeaders`` before the column
    type from ``readHTML.getColumnType`` is appended.
    """
    headerRows = _cleanHeaderRows(table.htmlMatrix, startRow)
    headerRows[-1] = ['spancol' if h == '' else h for h in headerRows[-1]]

    merged = [
        "**".join(row[col] for row in headerRows)
        for col in range(len(headerRows[0]))
    ]
    # Drop separators left at the front by blank upper-level cells.
    merged = [h.lstrip("*") for h in merged]
    if startRow > 1:
        merged = [h[:-2] if h.endswith("**") else h for h in merged]
    merged = textProcessing.orderHeaders(merged)

    typedHeaders = []
    for i, header in enumerate(merged):
        columnType = readHTML.getColumnType(i, startRow, table.htmlMatrix)
        typedHeaders.append(header + "@" + str(columnType))
    return typedHeaders


def updateJsonFile(fileName):
    """Recompute the column headers of every table in a JSON article file.

    The file is decoded with ``ComplexDecoder``; each table gets its column
    headers, start row and (for typed tables) table type re-set, and the
    updated article is written to ``FOLDER_OUT/<basename>.json``.

    Tables whose type is missing/empty, is not ``TableType.WELL_FORMED``,
    whose main headers cannot be read, or whose start row is 0 are stored
    with empty column headers. For the remaining tables ``extractLinks2`` is
    called; a failure there is printed and does not stop processing.

    Args:
        fileName: "/"-separated path of the input file. Files whose
            extension does not contain "json" are ignored.

    Returns:
        The result of the last successful ``extractLinks2`` call, or ``""``
        if there was none. ``None`` when the file is skipped or when
        processing fails (the error is printed, not raised).

    Raises:
        OSError: if *fileName* cannot be opened for reading.
    """
    file, file_extension = os.path.splitext(fileName.split("/")[-1])
    if "json" not in file_extension:
        return
    # Context manager so the input handle is always released.
    with open(fileName, "r") as jsonFile:
        obj = jsonFile.read()

    try:
        article = ComplexDecoder().default(json.loads(obj))
        tables2D = []
        out = ""
        for table in article.tables:
            if table.tableType is None or table.tableType.value == "":
                # No usable type value: leave the table type untouched.
                _clearTableHeaders(table, resetType=False)
                tables2D.append(table)
                continue
            if table.tableType.value != TableType.WELL_FORMED.value:
                _clearTableHeaders(table)
                tables2D.append(table)
                continue

            try:
                startRow, _ = readHTML.getMainColHeaders(table.htmlMatrix)
            except Exception:
                # Best effort: a table whose headers cannot be read is kept
                # without headers instead of aborting the whole file.
                _clearTableHeaders(table)
                tables2D.append(table)
                continue

            if startRow == 0:
                _clearTableHeaders(table, startRow)
                tables2D.append(table)
                continue

            table.setStartRows(startRow)
            typedHeaders = _buildTypedHeaders(table, startRow)
            table.setColHeaders(typedHeaders)
            table.ncols = len(typedHeaders)
            table.setTableType(table.tableType.value)
            tables2D.append(table)
            try:
                out = extractLinks2(article.title, table)
            except Exception:
                print("Error links extraction: ", table.tableId)
                traceback.print_exc()

        article.setTables(tables2D)
        # Serialise before opening the output so a serialisation error does
        # not leave a truncated file behind.
        serialized = json.dumps(
            article.reprJSON(), cls=ComplexEncoder, skipkeys=True)
        with open(FOLDER_OUT + "/" + file + ".json", "w") as f:
            f.write(serialized)

        return out
    except Exception:
        print("Error File: ", file)
        traceback.print_exc()