Esempio n. 1
0
def readFile(path):
    """Parse a bz2-compressed HTML file and write its tables as an article.

    The decompressed content is parsed with BeautifulSoup, the title and the
    tables are extracted through ``readHTML``, each table is converted with
    ``readHTML.tableTo2d`` and the resulting article is passed to
    ``writeArticle``.

    :param path: path of the bz2-compressed HTML file.
    """
    # The context manager guarantees the compressed stream is closed, even
    # when reading or parsing raises.
    with bz2.BZ2File(path) as bz_file:
        soup = BeautifulSoup(bz_file.read(), 'html.parser')
    title = readHTML.readTitle(soup)
    tables = readHTML.readTables(soup)
    tables2d = []
    for t in tables:
        # tableTo2d is unpacked as a pair here; only the second element
        # (the 2D table) is kept.
        _html, t2d = readHTML.tableTo2d(t)
        tables2d.append(t2d)
    # NOTE(review): the article id is hard-coded to 1 - confirm this is
    # intended for callers that process more than one file.
    article = Article(1, title, tables2d)
    writeArticle(article)
def extractTables(filename, folderOut, cont, dictCount):
    """Extract tables from html.bz, and generate a new file with only tables.

    Files whose extension does not contain ``bz2`` are ignored. Every table
    found is classified with ``tableValidator.validateHTMLTable`` and counted
    in ``dictCount``; tables of type WIKITABLE, NO_CSS_CLASS or
    WITH_INNER_TABLE are copied to the output document. The output file is
    only written when at least one ``</table>`` tag ends up in the document.
    Any error is logged and printed; nothing is raised to the caller.

        :param filename: filename bz file.
        :param folderOut: folder where generated files will be saved.
        :param cont: number of file, it will be article ID.
        :param dictCount: save stats of table types (mutated in place; must
            already contain a counter for every table type value).
    """
    fileNameSplit = filename.split("/")
    try:
        file, file_extension = os.path.splitext(fileNameSplit[-1])
        if "bz2" not in file_extension:
            return
        print("[Worker %d] File numer %d" % (os.getpid(), cont))
        # Close the compressed input as soon as it has been parsed.
        with bz2.BZ2File(filename, "rb") as bzFile:
            soup = BeautifulSoup(bzFile.read(), 'html.parser')
        title = readHTML.readTitle(soup)
        tables = readHTML.readTables(soup)

        usefulTypes = (TableType.WIKITABLE.value,
                       TableType.NO_CSS_CLASS.value,
                       TableType.WITH_INNER_TABLE.value)
        # Collect the fragments and join once instead of growing a string.
        parts = [
            "<html><head></head><body><h1 class='firstHeading'>{}</h1>".format(
                title)
        ]
        for t in tables:
            tableType = tableValidator.validateHTMLTable(t)
            dictCount[tableType.value] += 1
            # Lazy formatting: the dict is only rendered when DEBUG is on.
            logging.debug('dictCount: %s', dictCount)
            if tableType.value in usefulTypes:
                parts.append(str(t) + "<br/>")
                dictCount[TableType.USEFULL_TABLE.value] += 1
        html = "".join(parts)

        if "</table>" in html or "</TABLE>" in html:
            outDir = folderOut if folderOut.endswith("/") else folderOut + "/"
            html += "</body></html>"
            # The context manager closes the output even if write() fails.
            with bz2.open(outDir + file + ".bz2", "wt") as newFile:
                newFile.write(html)
    except Exception:
        # Best-effort worker: report the failure and carry on. Only
        # Exception is caught so Ctrl-C / SystemExit still stop the worker.
        try:
            logging.debug('Error: ' + filename)
        except Exception:
            print("Error name file: ", cont)
        traceback.print_exc()
Esempio n. 3
0
def normalizeTables(filename):
    """Convert the tables of one article to JSON and report per-type counts.

    Tables nested inside a table cell or row (an ancestor named ``th``,
    ``td`` or ``tr``) are skipped. Each remaining table is converted with
    ``readHTML.tableTo2d``; the kept results get the id
    ``<cont>.<running number>`` and, if there is at least one, are written
    as an ``Article`` to ``FOLDER_OUT/<cont>.json``.

    :param filename: string of the form ``<path>##$##<number>``: the
        bz2-compressed HTML file and the number used as article id.
    :return: one newline-terminated, tab-separated line holding the article
        number followed by the counts of ILL_FORMED, NO_PROCESSED,
        WELL_FORMED, SMALLTABLE, WITH_INNER_TABLE and FORMAT_BOX tables.
        All counts are 0 when the input file cannot be read.
    """
    nameParts = filename.split("##$##")
    file = nameParts[0]
    cont = int(nameParts[1])
    print("cont: ", cont)
    try:
        # The context manager closes the stream even if parsing fails.
        with bz2.BZ2File(file, "rb") as bzFile:
            soup = BeautifulSoup(bzFile.read(), 'html.parser')
    except Exception:
        print("Error reading file: ", filename)
        # Same layout as the regular return value: id, tab, six counters.
        return "\t".join([str(cont)] + ["0"] * 6) + "\n"

    # Column order of the returned line (also the dict insertion order).
    statOrder = (
        TableType.ILL_FORMED.value,
        "NO_PROCESSED",
        TableType.WELL_FORMED.value,
        TableType.SMALLTABLE.value,
        TableType.WITH_INNER_TABLE.value,
        TableType.FORMAT_BOX.value,
    )
    dictStat = {key: 0 for key in statOrder}
    try:
        title = readHTML.readTitle(soup)
        tables = readHTML.readTables(soup)
        tables2d = []
        contTables = 1
        formatTables = 0
        for it, t in enumerate(tables):
            try:
                parents = [p.name for p in t.findParents()]
                # Skip tables nested inside another table's row or cell.
                if t.parent is not None and ("th" in parents
                                             or "td" in parents
                                             or "tr" in parents):
                    continue
                start = time.time()
                listt2d = readHTML.tableTo2d(t)
                logging.debug("Time reading table: %s", time.time() - start)
                validTables = []
                if listt2d is None or len(listt2d) == 0:
                    # Nothing could be extracted: try to keep the raw table.
                    newTable = readHTML.saveIllTable(
                        t, TableType.ILL_FORMED.value)
                    if newTable is not None:
                        validTables.append(newTable)
                        dictStat[TableType.ILL_FORMED.value] += 1
                    else:
                        dictStat["NO_PROCESSED"] += 1
                elif len(listt2d) > 10:
                    # More than 10 sub-tables: count the table as ill-formed
                    # and skip it. The previous code appended an unbound (or
                    # stale) ``newTable`` here before skipping, which either
                    # raised NameError or had no effect on the output.
                    # NOTE(review): confirm the table should be dropped
                    # rather than saved through saveIllTable.
                    dictStat[TableType.ILL_FORMED.value] += 1
                    continue
                else:
                    for t2d in listt2d:
                        kind = t2d.tableType
                        if kind == TableType.FORMAT_BOX.value:
                            dictStat[TableType.FORMAT_BOX.value] += 1
                            formatTables += 1
                            continue
                        if kind == TableType.SMALLTABLE.value:
                            dictStat[TableType.SMALLTABLE.value] += 1
                            continue
                        # Everything else is kept; only the counter differs.
                        if kind == TableType.ILL_FORMED.value:
                            dictStat[TableType.ILL_FORMED.value] += 1
                        elif kind == TableType.WITH_INNER_TABLE.value:
                            dictStat[TableType.WITH_INNER_TABLE.value] += 1
                        else:
                            dictStat[TableType.WELL_FORMED.value] += 1
                        validTables.append(t2d)

                for t2d in validTables:
                    tableId = str(cont) + "." + str(contTables)
                    t2d.setTableId(tableId)
                    tables2d.append(t2d)
                    contTables += 1
            except Exception:
                # One broken table must not abort the whole article.
                traceback.print_exc()
                print("Error: ", filename, it)
                continue
        if len(tables2d) > 0:
            article = Article(articleId=str(cont),
                              title=title,
                              tables=tables2d)
            # Serialize before opening so a failure leaves no empty file.
            payload = json.dumps(article.reprJSON(),
                                 cls=ComplexEncoder,
                                 skipkeys=True)
            with open(FOLDER_OUT + "/" + str(cont) + ".json", "w") as f:
                f.write(payload)
        else:
            if len(tables) == formatTables:
                logging.debug("Format table: " + filename)
            else:
                logging.debug("Error none useful table: " + filename)
        logging.debug(dictStat)
    except Exception:
        traceback.print_exc()
        # A %s placeholder is required: passing filename as an extra
        # argument without one makes logging fail while formatting.
        logging.debug("Error file %s", filename)

    return "\t".join([str(cont)] +
                     [str(dictStat[key]) for key in statOrder]) + "\n"