Example #1
    def test_rowspan(self):
        html = """<html><body><table border="1" class="wikitable"><tr><td>1</td>
        <td colspan="2">2 and 3</td><td>4</td></tr>
        <tr><td rowspan="3">5,9 and 13</td>
        <td>6</td><td>7</td><td>8</td></tr>
        <tr><td>10</td><td>11</td><td>12</td></tr>
        <tr><td colspan="3">14,15 and 16</td></tr>
        </table></body></html>"""
        soup = BeautifulSoup(html, 'html.parser')
        soup = readHTML.readTables(soup)[0]
        listt = readHTML.tableTo2d(soup)
        table2d=listt[0]
        table2d = table2d.toHTML()
        self.assertFalse(table2d is None)

        result = """<table ><tr><td>1</td>
                <td>2 and 3</td><td>2 and 3</td><td>4</td></tr>
                <tr><td>5,9 and 13</td>
                <td>6</td><td>7</td><td>8</td></tr>
                <tr><td>5,9 and 13</td><td>10</td><td>11</td><td>12</td></tr>
                <tr><td>5,9 and 13</td><td>14,15 and 16</td><td>14,15 and 16</td><td>14,15 and 16</td></tr>
                </table>""".replace(" ", "").replace("\n", "")
        #resultSoup=BeautifulSoup(table2d,"html.parser")
        #resultSoup=readHTML.removeSpanAttrs(resultSoup)
        resultSoup = str(table2d).replace(" ", "").replace("\n", "")
        self.assertEqual(resultSoup,result)
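
The pattern these tests assert, duplicating a cell across its rowspan/colspan footprint, can be reproduced with plain BeautifulSoup. A minimal sketch, independent of readHTML (the names below are illustrative, not the project's API):

from bs4 import BeautifulSoup

def expand_spans(table):
    # Build a dense grid by copying each cell's text into every
    # (row, col) slot covered by its rowspan/colspan.
    grid = {}
    for r, tr in enumerate(table.find_all("tr")):
        c = 0
        for cell in tr.find_all(["td", "th"]):
            while (r, c) in grid:  # slot already taken by an earlier rowspan
                c += 1
            rs = int(cell.get("rowspan") or 1)
            cs = int(cell.get("colspan") or 1)
            for dr in range(rs):
                for dc in range(cs):
                    grid[(r + dr, c + dc)] = cell.get_text(strip=True)
            c += cs
    nrows = max(r for r, _ in grid) + 1
    ncols = max(c for _, c in grid) + 1
    return [[grid.get((r, c), "") for c in range(ncols)] for r in range(nrows)]

Applied to the HTML in test_rowspan above, this produces the same dense 4x4 matrix the expected result encodes.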
Example #2
 def testHeaders(self):
     html="""<table class="wikitable sortable jquery-tablesorter">
         <thead></thead><tbody><tr><td>Season</td><td>2017</td>
         <td>2018</td><td><b>Total</b></td></tr><tr align="center"><td>Wins
         </td><td>1</td><td>0</td><td><b>1</b></td></tr></tbody><tfoot></tfoot></table>"""
     soup = BeautifulSoup(html, 'html.parser')
     tables = readHTML.readTables(soup)
     listt = readHTML.tableTo2d(tables[0])
     table2d = listt[0]
     headers = readHTML.getMainColHeaders(table2d.htmlMatrix)
Example #3
def readFile(path):
    with bz2.BZ2File(path) as bz_file:
        soup = BeautifulSoup(bz_file.read(), 'html.parser')
    title = readHTML.readTitle(soup)
    tables = readHTML.readTables(soup)
    tables2d = []
    for t in tables:
        html, t2d = readHTML.tableTo2d(t)
        tables2d.append(t2d)
    article = Article(1, title, tables2d)
    writeArticle(article)
Example #4
def getColumnHeaders(tableMatrix):
    startRows, colHeaders = readHTML.getMainColHeaders(tableMatrix)
    startRows += 1

    if len(set(colHeaders)) == 1 and colHeaders[0] == "":
        colHeaders = []
    else:
        colHeaders = [h.lower().strip().replace(" ", "_") + "@" + str(
            readHTML.getColumnType(i, startRows, tableMatrix)) if h != "" else "spancol@" + str(
            readHTML.getColumnType(i, startRows, tableMatrix)) for i, h in enumerate(colHeaders)]
    return colHeaders
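
The convention produced here is header@type, with spancol standing in for empty header cells. A tiny self-contained illustration with a stubbed type function (the real readHTML.getColumnType presumably inspects the column's cells; the stub below is hypothetical):

def nameHeaders(colHeaders, getType):
    # Empty headers become "spancol"; every header is tagged "@<column type>"
    return [(h.lower().strip().replace(" ", "_") or "spancol") + "@" + str(getType(i))
            for i, h in enumerate(colHeaders)]

print(nameHeaders(["Country or region", "", "Maximum elevation"], lambda i: 3))
# ['country_or_region@3', 'spancol@3', 'maximum_elevation@3']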
Example #5
 def test_colspan(self):
     html = """<!DOCTYPE html><html>
         <head> <meta charset="utf-8"><meta name="description" content=""><meta name="keywords" content="">
             <title>Table Practice</title></head>
         <body><table class="wikitable" border="1" align="center" cellpadding="10px">
         <thead><tr><th rowspan="3">Day</th>
         <th colspan="3">Seminar</th></tr>
         <tr><th colspan="2">Schedule</th>
         <th rowspan="2">Topic</th></tr>
         <tr><th>Begin</th><th>End</th></tr></thead>
         <tbody><tr><td rowspan="2">Monday</td>
         <td rowspan="2">8:00 a.m</td><td rowspan="2">5:00 p.m</td>
         <td rowspan="">Introduction to XML</td></tr>
         <tr><td rowspan="">Validity: DTD and Relax NG</td>
         </tr><tr><td rowspan="4">Tuesday</td>
         <td>8:00 a.m</td><td>11:00 a.m</td>
         <td rowspan="2">XPath</td></tr><tr>
         <td rowspan="2">11:00 a.m</td>
         <td rowspan="2">2:00 p.m</td>
         </tr><tr><td rowspan="2">XSL transformation</td></tr>
         <tr><td>2:00 p.m</td><td>5:00 p.m</td></tr>
         <tr><td>Wednesday</td><td>8:00 a.m</td><td>12:00 p.m</td><td>XLS Formatting Objects</td>
         </tr></tbody></table>
         </body>
         </html>
         """
     soup = BeautifulSoup(html, 'html.parser')
     tables = readHTML.readTables(soup)
     lt = len(tables)
     self.assertEqual(lt, 1)
     listt = readHTML.tableTo2d(tables[0])
     table2d = listt[0]
     table2d = table2d.toHTML()
     self.assertFalse(table2d is None)
     table2dcontent = table2d.replace(" ", "").replace("\n", "")
     result = """<table>
     <tr><th>Day</th><th>Seminar</th><th>Seminar</th><th>Seminar</th></tr>
     <tr><th>Day</th><th>Schedule</th><th>Schedule</th><th>Topic</th></tr>
     <tr><th>Day</th><th>Begin</th><th>End</th><th>Topic</th></tr>
     <tr><td>Monday</td><td>8:00 a.m</td><td>5:00 p.m</td><td>Introduction to XML</td></tr>
     <tr><td>Monday</td><td>8:00 a.m</td><td>5:00 p.m</td><td>Validity: DTD and Relax NG</td></tr>
     <tr><td>Tuesday</td><td>8:00 a.m</td><td>11:00 a.m</td><td>XPath</td></tr>
     <tr><td>Tuesday</td><td>11:00 a.m</td><td>2:00 p.m</td><td>XPath</td></tr>
     <tr><td>Tuesday</td><td>11:00 a.m</td><td>2:00 p.m</td><td>XSL transformation</td></tr>
     <tr><td>Tuesday</td><td>2:00 p.m</td><td>5:00 p.m</td><td>XSL transformation</td></tr>
     <tr><td>Wednesday</td><td>8:00 a.m</td><td>12:00 p.m</td><td>XLS Formatting Objects</td></tr></table>""".replace(
         " ", "").replace("\n", "")
     print(table2dcontent)
     self.assertEqual(table2dcontent, result)
Example #6
 def test_without_span(self):
     html = """<table border="1" class="wikitable">
     <tr><td>1</td><td>2</td><td>3</td></tr>
     <tr><td>4</td><td>5</td><td>6</td></tr>
     </table>"""
     soup = BeautifulSoup(html, 'html.parser')
     listt = readHTML.tableTo2d(readHTML.readTables(soup)[0])
     table2d = listt[0]
      table2d = table2d.toHTML()
     self.assertFalse(table2d is None)
     table2dcontent = table2d.replace(" ", "").replace("\n", "")
     result = """<table>
     <tr><td>1</td><td>2 </td><td>3</td></tr>
     <tr><td>4</td><td>5</td><td>6</td></tr>
     </table>""".replace(" ", "").replace("\n", "")
     self.assertEqual(table2dcontent, result)
Example #7
def extractCellResources(content):
    # Extract links from the cell content and resolve each to a Wikidata ID
    bscell = BeautifulSoup(content, "html.parser")
    linksCell = readHTML.readTableCellLinks(bscell)

    if linksCell is None or len(linksCell) == 0:
        return []

    resources = {}
    for link in linksCell:
        _link = wikiLink(link)
        if _link is not None and _link != "":
            resource = Resource(_link)
            wd = wikidataDAO.getWikidataID(_link)
            if wd is not None and wd != "":
                resource.setId(wd)
            resources[_link] = resource
        else:
            # _link may be None here; fall back to an empty string so the
            # placeholder key cannot raise a TypeError on concatenation
            key = "ex: " + (_link or "")
            resources[key] = Resource(key)
    #print("List resources:", resources)
    resources = list(resources.values())
    return resources
Example #8
def updateJsonFile(fileName):
    print("filename: ", fileName)
    tableId = fileName.split("/")[-1].replace(".json", "").replace("_", ".")
    with open(fileName, "r") as f:
        obj = json.loads(f.read())
    # Convert the JSON object into a Table object
    table = ComplexDecoderTable().default(obj)

    if table.tableType is None or table.tableType.value == "":
        table.setColHeaders([])
        table.setStartRows(0)
        writeTable(table, tableId)
        return
    if table.tableType.value != TableType.WELL_FORMED.value:
        table.setTableType(table.tableType.value)
        table.setColHeaders([])
        table.setStartRows(0)
        writeTable(table, tableId)
    else:
        startRow = table.startRows
        if startRow == 0:
            table.setTableType(table.tableType.value)
            table.setColHeaders([])
            table.setStartRows(startRow)
            writeTable(table, tableId)
        else:
            table.setTableType(table.tableType.value)
            startRows, colHeadersType = readHTML.getColHeaderAllLevels(
                table.htmlMatrix, table.startRows, textProcessing)
            table.setColHeaders(colHeadersType)
            writeTable(table, tableId)
Example #9
 def test_innerTable(self):
     html = """<table><tbody><tr valign="top"><td><table class="wikitable" style="font-size:95%"><tbody><tr bgcolor="#efefef">
     <td colspan="2"><b>Legend</b></td></tr><tr bgcolor="#f3e6d7"><td>Grand Slam</td><td align="center">0</td></tr><tr bgcolor="#ffffcc">
     <td>WTA Championships</td><td align="center">0</td></tr><tr bgcolor="#ffcccc"><td>Tier I</td><td align="center">0</td></tr>
     <tr bgcolor="#ccccff"><td>Tier II</td><td align="center">0</td></tr><tr bgcolor="#CCFFCC"><td>Tier III</td><td align="center">0
     </td></tr><tr bgcolor="#66CCFF"><td>Tier IV &amp; V</td><td align="center">0</td></tr></tbody></table></td><td><table class="wikitable" style="font-size:95%">
     <tbody><tr bgcolor="#efefef"><td colspan="2"><b>Titles by Surface</b></td></tr><tr><td>Hard</td>
     <td align="center">0</td></tr><tr><td>Clay</td><td align="center">0</td></tr><tr><td>Grass</td>
     <td align="center">0</td></tr><tr><td>Carpet</td><td align="center">0</td></tr></tbody></table></td></tr></tbody></table>"""
     soup = BeautifulSoup(html, 'html.parser')
     tables = readHTML.readTables(soup)
     listt = readHTML.tableTo2d(tables[0])
     table2d = listt[0]
      headers = readHTML.getMainColHeaders(table2d.htmlMatrix)
      self.assertFalse(table2d is None)
      table2d = table2d.toHTML()
     table2dcontent = table2d.replace(" ", "").replace("\n", "")
Example #10
    def test_interTitle(self):

        html = """<table class="x wikitable y" border="1" cellpadding="10px" align="center"><thead>
        <tr><th rowspan="3">A</th><th colspan="3">B</th></tr>
        <tr><th colspan="2">C</th><th rowspan="2">F</th></tr>
        <tr><th>D</th><th>E</th></tr></thead><tbody>
        <tr><td rowspan="2">1</td><td rowspan="2">a</td><td rowspan="2">b</td><td>x</td></tr>
        <tr><td>y</td></tr>
        <tr><td rowspan="4">2</td><td colspan="2">cd</td><td rowspan="2">z</td></tr>
        <tr><td rowspan="2">e</td><td rowspan="2">f</td></tr>
        <tr><td rowspan="2">w</td></tr>
        <tr><td>g</td><td>h</td></tr>
        <tr><th colspan="4">3</th></tr><tr>
        <td rowspan="4">4</td><td colspan="3">ijr</td></tr>
        <tr><td rowspan="2">5</td><td colspan="2">m</td></tr>
        <tr><td rowspan="2">n</td><td colspan="2">s</td></tr>
        <tr><td>6</td> <td>t</td></tr></tbody></table>"""
        soup = BeautifulSoup(html, 'html.parser')
        tables = readHTML.readTables(soup)
        lt = len(tables)
        self.assertEqual(lt, 1)
        listt = readHTML.tableTo2d(tables[0])
        table2d = listt[0]
        table2d = table2d.toHTML()
        self.assertFalse(table2d is None)
        table2dcontent = table2d.replace(" ", "").replace("\n", "")
        result = """<table>
        <tr><th>A</th><th>B</th><th>B</th><th>B</th></tr>
        <tr><th>A</th><th>C</th><th>C</th><th>F</th></tr>
        <tr><th>A</th><th>D</th><th>E</th><th>F</th></tr>
        <tr><td>1</td><td>a</td><td>b</td><td>x</td></tr>
        <tr><td>1</td><td>a</td><td>b</td><td>y</td></tr>
        <tr><td>2</td><td>cd</td><td>cd</td><td>z</td></tr>
        <tr><td>2</td><td>e</td><td>f</td><td>z</td></tr>
        <tr><td>2</td><td>e</td><td>f</td><td>w</td></tr>
        <tr><td>2</td><td>g</td><td>h</td><td>w</td></tr>
        <tr><th>3</th><th>3</th><th>3</th><th>3</th></tr>
        <tr><td >4</td><td >ijr</td><td >ijr</td><td >ijr</td></tr>
        <tr><td >4</td><td >5</td><td >m</td><td >m</td></tr>
        <tr><td >4</td><td >5</td><td >n</td><td >s</td></tr>
        <tr><td >4</td><td >6</td><td >n</td><td >t</td></tr>
        </table>""".replace(
            " ", "").replace("\n", "")
        print(table2dcontent)
        print(result)
        self.assertEqual(table2dcontent, result)
Example #11
def extractTables(filename, folderOut, cont, dictCount):
    """Extract tables from a bz2-compressed HTML file and generate a new file containing only the tables.
        :param filename: path to the bz2 file.
        :param folderOut: folder where the generated files will be saved.
        :param cont: file number; used as the article ID.
        :param dictCount: dict collecting stats per table type.
    """
    fileNameSplit = filename.split("/")
    try:
        file, file_extension = os.path.splitext(
            fileNameSplit[len(fileNameSplit) - 1])
        if "bz2" not in file_extension:
            return
        print("[Worker %d] File numer %d" % (os.getpid(), cont))
        bzFile = bz2.BZ2File(filename, "rb")
        soup = BeautifulSoup(bzFile.read(), 'html.parser')
        title = readHTML.readTitle(soup)
        tables = readHTML.readTables(soup)

        html = "<html><head></head><body><h1 class='firstHeading'>{}</h1>".format(
            title)
        for t in tables:
            tableType = tableValidator.validateHTMLTable(t)
            dictCount[tableType.value] += 1
            logging.debug('dictCount: ' + str(dictCount))
            if (tableType.value == TableType.WIKITABLE.value
                    or tableType.value == TableType.NO_CSS_CLASS.value
                    or tableType.value == TableType.WITH_INNER_TABLE.value):
                html += str(t) + "<br/>"
                dictCount[TableType.USEFULL_TABLE.value] += 1

        if "</table>" in html or "</TABLE>" in html:
            if folderOut.endswith("/"):
                newFile = bz2.open(folderOut + file + ".bz2", "wt")
            else:
                newFile = bz2.open(folderOut + "/" + file + ".bz2", "wt")
            html += "</body></html>"
            newFile.write(html)
            newFile.close()
    except:
        try:
            logging.debug('Error: ' + filename)
        except:
            print("Error name file: ", cont)
        traceback.print_exc()
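
The compression handling above is standard-library bz2; a minimal round-trip sketch of the same read/write pattern (file names are placeholders):

import bz2
from bs4 import BeautifulSoup

# Read a bzip2-compressed HTML article
with bz2.BZ2File("article.html.bz2", "rb") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

# Write a reduced document, containing only the tables, back out compressed
with bz2.open("tables_only.bz2", "wt") as out:
    out.write("<html><body>" +
              "".join(str(t) for t in soup.find_all("table")) +
              "</body></html>")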
Example #12
 def testCaption(self):
     html = """<html><body><table class="wikitable" style="text-align: center;float:right;">
     <caption>Irish stadiums in 1999 World Cup</caption><tbody><tr>
     <td><b>City</b></td><td><b>Stadium</b></td><td><b>Capacity</b>
     </td></tr><tr><td><span class="flagicon"><a href="/wiki/Republic_of_Ireland" title="Republic of Ireland"><img alt="Republic of Ireland" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/45/Flag_of_Ireland.svg/23px-Flag_of_Ireland.svg.png" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/45/Flag_of_Ireland.svg/35px-Flag_of_Ireland.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/4/45/Flag_of_Ireland.svg/46px-Flag_of_Ireland.svg.png 2x" data-file-width="1200" data-file-height="600" width="23" height="12"></a></span> <a href="/wiki/Dublin" title="Dublin">Dublin</a></td>
     <td><a href="/wiki/Lansdowne_Road" title="Lansdowne Road">Lansdowne Road</a></td>
     <td>49,250</td></tr><tr><td><span class="flagicon"><a href="/wiki/Republic_of_Ireland" title="Republic of Ireland"><img alt="Republic of Ireland" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/45/Flag_of_Ireland.svg/23px-Flag_of_Ireland.svg.png" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/45/Flag_of_Ireland.svg/35px-Flag_of_Ireland.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/4/45/Flag_of_Ireland.svg/46px-Flag_of_Ireland.svg.png 2x" data-file-width="1200" data-file-height="600" width="23" height="12"></a></span> <a href="/wiki/Limerick" title="Limerick">Limerick</a></td>
     <td><a href="/wiki/Thomond_Park" title="Thomond Park">Thomond Park</a></td>
     <td>13,500</td></tr><tr><td><span class="flagicon"><a href="/wiki/United_Kingdom" title="United Kingdom"><img alt="United Kingdom" src="//upload.wikimedia.org/wikipedia/en/thumb/a/ae/Flag_of_the_United_Kingdom.svg/23px-Flag_of_the_United_Kingdom.svg.png" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/en/thumb/a/ae/Flag_of_the_United_Kingdom.svg/35px-Flag_of_the_United_Kingdom.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/a/ae/Flag_of_the_United_Kingdom.svg/46px-Flag_of_the_United_Kingdom.svg.png 2x" data-file-width="1200" data-file-height="600" width="23" height="12"></a></span>  <a href="/wiki/Belfast" title="Belfast">Belfast</a></td>
     <td><a href="/wiki/Ravenhill_Stadium" class="mw-redirect" title="Ravenhill Stadium">Ravenhill Stadium</a></td>
     <td>12,500</td></tr></tbody></table><html><body>"""
     soup = BeautifulSoup(html, 'html.parser')
     tables = readHTML.readTables(soup)
     listt = readHTML.tableTo2d(tables[0])
     table2d = listt[0]
     headers = readHTML.getMainColHeaders(table2d.htmlMatrix)
     self.assertFalse(table2d is None)
     table2dcontent = table2d.toHTML().replace(" ", "").replace("\n", "")
Example #13
    def test_innerEqualTables(self):
        html ="""<table><tbody><tr><td width="10%" valign="top"><table class="wikitable">
        <tbody><tr><th width="150">Pool A</th><th width="15">W</th><th width="15">L</th></tr>
        <tr bgcolor="#ffffcc"><td><span class="flagicon"><a href="/wiki/Wisconsin" title="Wisconsin">
        <img alt="Wisconsin" src="//upload.wikimedia.org/wikipedia/commons/thumb/2/22/Flag_of_Wisconsin.svg/23px-Flag_of_Wisconsin.svg.png"
         width="23" height="15" class="thumbborder" 
         srcset="//upload.wikimedia.org/wikipedia/commons/thumb/2/22/Flag_of_Wisconsin.svg/35px-Flag_of_Wisconsin.svg.png 1.5x,
          //upload.wikimedia.org/wikipedia/commons/thumb/2/22/Flag_of_Wisconsin.svg/45px-Flag_of_Wisconsin.svg.png 2x" 
          data-file-width="675" data-file-height="450"></a></span> <a href="/wiki/Erika_Brown" 
          title="Erika Brown">Erika Brown</a></td><td>4</td><td>0</td></tr><tr bgcolor="#ffffcc"><td>
          <span class="flagicon"><a href="/wiki/Massachusetts" title="Massachusetts"><img alt="Massachusetts" 
          src="//upload.wikimedia.org/wikipedia/commons/thumb/f/f2/Flag_of_Massachusetts.svg/23px-Flag_of_Massachusetts.svg.png" 
          width="23" height="14" class="thumbborder" 
          srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/f2/Flag_of_Massachusetts.svg/35px-Flag_of_Massachusetts.svg.png 1.5x, 
          //upload.wikimedia.org/wikipedia/commons/thumb/f/f2/Flag_of_Massachusetts.svg/46px-Flag_of_Massachusetts.svg.png 2x" data-file-width="1500" data-file-height="900">
          </a></span> <a href="/wiki/Korey_Dropkin" title="Korey Dropkin">Korey Dropkin</a></td><td>3</td><td>1</td></tr>
          <tr bgcolor="#ffffcc"><td><span class="flagicon"><a href="/wiki/Ontario" title="Ontario"><img alt="Ontario" 
          src="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/23px-Flag_of_Ontario.svg.png" width="23" 
          height="12" class="thumbborder" 
          srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/35px-Flag_of_Ontario.svg.png 1.5x, 
          //upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/46px-Flag_of_Ontario.svg.png 2x" 
          data-file-width="2400" data-file-height="1200">
          </a></span> <a href="/w/index.php?title=Ben_Bevan&amp;action=edit&amp;redlink=1" class="new" 
          title="Ben Bevan (page does not exist)">Ben Bevan</a></td><td>2</td><td>2</td></tr><tr><td><span class="flagicon">
          <a href="/wiki/Minnesota" title="Minnesota"><img alt="Minnesota" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/23px-Flag_of_Minnesota.svg.png" width="23" height="15" class="thumbborder" 
          srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/35px-Flag_of_Minnesota.svg.png 1.5x, 
          //upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/46px-Flag_of_Minnesota.svg.png 2x" 
          data-file-width="500" data-file-height="318"></a></span> <a href="/wiki/Cory_Christensen" 
          title="Cory Christensen">Cory Christensen</a></td><td>1</td><td>3</td></tr><tr><td><span class="flagicon"><a href="/wiki/Pennsylvania" title="Pennsylvania"><img alt="Pennsylvania" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Flag_of_Pennsylvania.svg/23px-Flag_of_Pennsylvania.svg.png" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Flag_of_Pennsylvania.svg/35px-Flag_of_Pennsylvania.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Flag_of_Pennsylvania.svg/45px-Flag_of_Pennsylvania.svg.png 2x" data-file-width="675" data-file-height="450"></a></span> <a href="/w/index.php?title=Nicholas_Visnich&amp;action=edit&amp;redlink=1" class="new" title="Nicholas Visnich (page does not exist)">Nicholas Visnich</a></td><td>0</td><td>4</td></tr></tbody></table></td><td width="10%" valign="top"><table class="wikitable"><tbody><tr><th width="150">Pool B</th><th width="15">W</th><th width="15">L</th></tr><tr bgcolor="#ffffcc"><td><span class="flagicon"><a href="/wiki/Ontario" title="Ontario"><img alt="Ontario" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/23px-Flag_of_Ontario.svg.png" width="23" height="12" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/35px-Flag_of_Ontario.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/46px-Flag_of_Ontario.svg.png 2x" data-file-width="2400" data-file-height="1200"></a></span> <a href="/wiki/Scott_McDonald_(curler)" title="Scott McDonald (curler)">Scott McDonald</a></td><td>4</td><td>0</td></tr><tr bgcolor="#ffffcc"><td><span class="flagicon"><a href="/wiki/Minnesota" title="Minnesota"><img alt="Minnesota" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/23px-Flag_of_Minnesota.svg.png" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/35px-Flag_of_Minnesota.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/46px-Flag_of_Minnesota.svg.png 2x" data-file-width="500" data-file-height="318"></a></span> <a href="/wiki/Alexandra_Carlson" title="Alexandra Carlson">Alexandra Carlson</a></td><td>2</td><td>2</td></tr><tr bgcolor="#ccffcc"><td><span class="flagicon"><a href="/wiki/California" title="California"><img alt="California" src="//upload.wikimedia.org/wikipedia/commons/thumb/0/01/Flag_of_California.svg/23px-Flag_of_California.svg.png" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/0/01/Flag_of_California.svg/35px-Flag_of_California.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/0/01/Flag_of_California.svg/45px-Flag_of_California.svg.png 2x" data-file-width="900" data-file-height="600"></a></span> <a href="/w/index.php?title=Gabrielle_Coleman&amp;action=edit&amp;redlink=1" class="new" title="Gabrielle Coleman (page does not exist)">Gabrielle Coleman</a></td><td>2</td><td>2</td></tr><tr><td><span class="flagicon"><a href="/wiki/Ontario" title="Ontario"><img alt="Ontario" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/23px-Flag_of_Ontario.svg.png" width="23" height="12" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/35px-Flag_of_Ontario.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/46px-Flag_of_Ontario.svg.png 2x" data-file-width="2400" 
data-file-height="1200"></a></span> <a href="/w/index.php?title=Trevor_Brewer_(curler)&amp;action=edit&amp;redlink=1" class="new" title="Trevor Brewer (curler) (page does not exist)">Trevor Brewer</a></td><td>1</td><td>3</td></tr><tr><td><span class="flagicon"><a href="/wiki/Minnesota" title="Minnesota"><img alt="Minnesota" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/23px-Flag_of_Minnesota.svg.png" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/35px-Flag_of_Minnesota.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/46px-Flag_of_Minnesota.svg.png 2x" data-file-width="500" data-file-height="318"></a></span> <a href="/w/index.php?title=Ethan_Meyers&amp;action=edit&amp;redlink=1" class="new" title="Ethan Meyers (page does not exist)">Ethan Meyers</a></td><td>1</td><td>3</td></tr></tbody></table></td><td width="10%" valign="top"><table class="wikitable"><tbody><tr><th width="150">Pool C</th><th width="15">W</th><th width="15">L</th></tr><tr bgcolor="#ffffcc"><td><span class="flagicon"><a href="/wiki/Minnesota" title="Minnesota"><img alt="Minnesota" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/23px-Flag_of_Minnesota.svg.png" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/35px-Flag_of_Minnesota.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/b9/Flag_of_Minnesota.svg/46px-Flag_of_Minnesota.svg.png 2x" data-file-width="500" data-file-height="318"></a></span> <a href="/w/index.php?title=Mark_Haluptzok&amp;action=edit&amp;redlink=1" class="new" title="Mark Haluptzok (page does not exist)">Mark Haluptzok</a></td><td>4</td><td>0</td></tr><tr bgcolor="#ffffcc"><td><span class="flagicon"><a href="/wiki/Indiana" title="Indiana"><img alt="Indiana" src="//upload.wikimedia.org/wikipedia/commons/thumb/a/ac/Flag_of_Indiana.svg/23px-Flag_of_Indiana.svg.png" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/a/ac/Flag_of_Indiana.svg/35px-Flag_of_Indiana.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/a/ac/Flag_of_Indiana.svg/45px-Flag_of_Indiana.svg.png 2x" data-file-width="750" data-file-height="500"></a></span> <a href="/w/index.php?title=Greg_Eigner&amp;action=edit&amp;redlink=1" class="new" title="Greg Eigner (page does not exist)">Greg Eigner</a></td><td>3</td><td>1</td></tr><tr bgcolor="#ccffcc"><td><span class="flagicon"><a href="/wiki/New_York_(state)" title="New York (state)"><img alt="New York (state)" src="//upload.wikimedia.org/wikipedia/commons/thumb/1/1a/Flag_of_New_York.svg/23px-Flag_of_New_York.svg.png" width="23" height="12" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/1/1a/Flag_of_New_York.svg/35px-Flag_of_New_York.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/1/1a/Flag_of_New_York.svg/46px-Flag_of_New_York.svg.png 2x" data-file-width="900" data-file-height="450"></a></span> <a href="/w/index.php?title=Joyance_Meechai&amp;action=edit&amp;redlink=1" class="new" title="Joyance Meechai (page does not exist)">Joyance Meechai</a></td><td>2</td><td>2</td></tr><tr><td><span class="flagicon"><a href="/wiki/Massachusetts" title="Massachusetts"><img alt="Massachusetts" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/f2/Flag_of_Massachusetts.svg/23px-Flag_of_Massachusetts.svg.png" width="23" height="14" class="thumbborder" 
srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/f2/Flag_of_Massachusetts.svg/35px-Flag_of_Massachusetts.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/f/f2/Flag_of_Massachusetts.svg/46px-Flag_of_Massachusetts.svg.png 2x" data-file-width="1500" data-file-height="900"></a></span> <a href="/w/index.php?title=Stephen_Dropkin&amp;action=edit&amp;redlink=1" class="new" title="Stephen Dropkin (page does not exist)">Stephen Dropkin</a></td><td>1</td><td>3</td></tr><tr><td><span class="flagicon"><a href="/wiki/Ontario" title="Ontario"><img alt="Ontario" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/23px-Flag_of_Ontario.svg.png" width="23" height="12" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/35px-Flag_of_Ontario.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/8/88/Flag_of_Ontario.svg/46px-Flag_of_Ontario.svg.png 2x" data-file-width="2400" data-file-height="1200"></a></span> <a href="/w/index.php?title=Gerry_Geurts&amp;action=edit&amp;redlink=1" class="new" title="Gerry Geurts (page does not exist)">Gerry Geurts</a></td><td>0</td><td>4</td></tr></tbody></table></td></tr></tbody></table>"""

        soup = BeautifulSoup(html, 'html.parser')
        tables = readHTML.readTables(soup)
        listt = readHTML.tableTo2d(tables[0])
        table2d = listt[0]
        headers = readHTML.getMainColHeaders(table2d.htmlMatrix)
        self.assertFalse(table2d is None)
        table2d = table2d.toHTML()
        table2dcontent = table2d.replace(" ", "").replace("\n", "")
Example #14
def createEntityMatrix(file):
    table = readFile(file)
    if table is None or table.htmlMatrix is None:
        return None
    htmlMatrix = np.array(table.htmlMatrix)

    tablem = []
    # Fill table headers with their plain-text content:
    for row in range(0, table.startRows):
        rowm = []
        for col in range(htmlMatrix.shape[1]):
            rowm.append(readHTML.getTableCellText(htmlMatrix[row][col]))
        tablem.append(rowm)
    #Fill table content:

    for row in range(table.startRows, htmlMatrix.shape[0]):
        rowm = []
        for col in range(htmlMatrix.shape[1]):
            resources = extractCellResources(htmlMatrix[row][col])
            entities = set()
            for res in resources:
                entities.add('wd::' + res.id)
            rowm.append(list(entities))
        tablem.append(rowm)
    table.htmlMatrix = tablem
    articleEntity = extractArticleResource(table.articleTitle)
    if articleEntity is not None:
        table.setArticleEntity(articleEntity.id)

    table.setTableType(table.tableType.value)
    with open(
            os.path.join(FOLDER_TABLES_OUT,
                         str(table.tableId.replace(".", "_")) + ".json"),
            "w") as ft:
        ft.write(json.dumps(table.reprJSON(), cls=ComplexEncoder, skipkeys=True))
Example #15
def extractLinksGenerator(articleTitle, table):
    out = ""

    tarray = np.array(table.htmlMatrix)
    start = table.startRows
    colHeaders = table.colHeaders
    #colHeaders = ["protag_article@3"]
    #colHeaders.extend(table.colHeaders)
    line = table.tableId + "\t" + str(colHeaders) + "\t" + str(len(tarray[0])) + \
           "\t" + str(len(tarray) - table.startRows) + "\t"
    prot = wikiLink(articleTitle)
    pwd = wikidataDAO.getWikidataID(prot)
    if pwd is None:
        pwd = ""
    if len(colHeaders) > 1:
        pairLink = {}
        tlinks = [[[] for x in range(tarray.shape[1])]
                  for y in range(len(tarray) - start)]
        rowLink = 0
        for row in range(start, tarray.shape[0]):
            for col in range(tarray.shape[1]):
                contentA = tarray[row][col]
                bscell = BeautifulSoup(contentA, "html.parser")
                linksCell = readHTML.readTableCellLinks(bscell)
                tlinks[rowLink][col] = linksCell
            rowLink += 1
        write = False

        dictRelByTable = {}
        for i in range(len(tlinks[0])):
            nameCol2 = colHeaders[i]
            dictRelCount = {}
            for row in range(len(tlinks)):
                linksR = tlinks[row][i]
                pos = str(start) + ":" + str(row + start) + ":" + str(
                    -1) + ":" + str(i)
                if len(linksR) == 0:
                    continue
                else:
                    for link in linksR:
                        _link = wikiLink(link)
                        if _link is not None and _link != "" and _link != prot:
                            wd = wikidataDAO.getWikidataID(_link)
                            if wd is None:
                                wd = ""
                            props = []
                            if pwd != "" and wd != "":
                                props = wikidataDAO.getRelations(pwd, wd)
                            if len(props) > 0:
                                for p in props:
                                    dictRelCount[p] = dictRelCount.get(p, 0) + 1
                                yield {
                                    cols: "protag_article@3##" + nameCol2,
                                    entity1: prot + " :" + pwd,
                                    entity2: _link + " :" + wd,
                                    relations: props
                                }
            dictRelByTable['protag_article@3##' + nameCol2] = dictRelCount
        for i in range(len(tlinks[0])):
            for j in range(i + 1, len(tlinks[0])):
                nameCol1 = colHeaders[i]
                nameCol2 = colHeaders[j]
                dictRelCount = {}
                for row in range(len(tlinks)):
                    pos = str(start) + ":" + str(row + start) + ":" + str(
                        i) + ":" + str(j)
                    linksL = tlinks[row][i]
                    linksR = tlinks[row][j]
                    if set(linksL) == set(linksR):
                        continue
                    if len(linksL) == 0 or len(linksR) == 0:
                        continue
                    for ll in linksL:
                        for lr in linksR:
                            lla = wikiLink(ll)
                            llb = wikiLink(lr)
                            if lla != "" and llb != "" and lla != llb:
                                wd1 = wikidataDAO.getWikidataID(lla)
                                if wd1 is None:
                                    wd1 = ""
                                wd2 = wikidataDAO.getWikidataID(llb)
                                if wd2 is None:
                                    wd2 = ""
                                props = []
                                if wd1 != "" and wd2 != "":
                                    props = wikidataDAO.getRelations(wd1, wd2)
                                if len(props) > 0:
                                    for p in props:
                                        dictRelCount[p] = dictRelCount.get(p, 0) + 1
                                    yield {
                                        cols: "protag_article@3##" + nameCol2,
                                        entity1: lla + " :" + wd1,
                                        entity2: llb + " :" + wd2,
                                        relations: props
                                    }
                dictRelByTable[nameCol1 + '##' + nameCol2] = dictRelCount
        return out, dictRelByTable
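
Two details worth noting in extractLinksGenerator: the yielded dicts use bare names (cols, entity1, entity2, relations) as keys, so those are presumably module-level string constants; and the function both yields items and ends with return out, dictRelByTable, which in Python 3 attaches the returned pair to StopIteration. A generic sketch of how a caller can recover it:

def consume(gen):
    # Drain a generator, collecting its yielded items and its return value
    items = []
    try:
        while True:
            items.append(next(gen))
    except StopIteration as stop:
        return items, stop.value  # stop.value is the generator's return value

# e.g.: rows, (out, dictRelByTable) = consume(extractLinksGenerator(title, table))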
Example #16
    def testHeadersMix(self):
        html="""
        <table class="wikitable sortable jquery-tablesorter">
<caption><big>Land surface elevation extremes by country</big><br><br>
</caption>
<thead><tr>
<th width="256px" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Country or region
</th>
<th width="256px" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Highest point
</th>
<th width="84px" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Maximum elevation
</th>
<th width="256px" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Lowest point
</th>
<th width="84px" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Minimum elevation
</th>
<th width="70px" class="headerSort" tabindex="0" role="columnheader button" title="Sort ascending">Elevation span
</th></tr></thead><tbody>
<tr>
<td><span class="flagicon"><img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/23px-Flag_of_Afghanistan.svg.png" decoding="async" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/35px-Flag_of_Afghanistan.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/45px-Flag_of_Afghanistan.svg.png 2x" data-file-width="900" data-file-height="600">&nbsp;</span><a href="/wiki/Afghanistan" title="Afghanistan">Afghanistan</a>
</td>
<td><a href="/wiki/Noshaq" title="Noshaq">Noshaq</a>
</td>
<td rowspan="1" align="center"><span style="display:none" data-sort-value="7003749200000000000♠"></span>7492&nbsp;m<br>24,580&nbsp;ft
</td>
<td><a href="/wiki/Amu_Darya" title="Amu Darya">Amu Darya</a>
</td>
<td rowspan="1" align="center"><span style="display:none" data-sort-value="7002258000000000000♠"></span>258&nbsp;m<br>846&nbsp;ft
</td>
<td rowspan="1" align="center"><span style="display:none" data-sort-value="7003723400000000000♠"></span>7234&nbsp;m<br>23,734&nbsp;ft
</td></tr>
<tr>
<td><span class="flagicon"><img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/3/36/Flag_of_Albania.svg/21px-Flag_of_Albania.svg.png" decoding="async" width="21" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/3/36/Flag_of_Albania.svg/32px-Flag_of_Albania.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/3/36/Flag_of_Albania.svg/42px-Flag_of_Albania.svg.png 2x" data-file-width="1000" data-file-height="714">&nbsp;</span><a href="/wiki/Albania" title="Albania">Albania</a>
</td>
<td><a href="/wiki/Korab_(mountain)" title="Korab (mountain)">Korab</a>
</td>
<td rowspan="1" align="center"><span style="display:none" data-sort-value="7003276400000000000♠"></span>2764&nbsp;m<br>9,068&nbsp;ft
</td>
<td><a href="/wiki/Adriatic_Sea" title="Adriatic Sea">Adriatic Sea</a>
</td>
<td rowspan="1" align="center"><span style="display:none" data-sort-value="5000000000000000000♠"></span>sea level
</td>
<td rowspan="1" align="center"><span style="display:none" data-sort-value="7003276400000000000♠"></span>2764&nbsp;m<br>9,068&nbsp;ft
</td></tr>
<tr>
<td><span class="flagicon"><img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/7/77/Flag_of_Algeria.svg/23px-Flag_of_Algeria.svg.png" decoding="async" width="23" height="15" class="thumbborder" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/7/77/Flag_of_Algeria.svg/35px-Flag_of_Algeria.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/7/77/Flag_of_Algeria.svg/45px-Flag_of_Algeria.svg.png 2x" data-file-width="900" data-file-height="600">&nbsp;</span><a href="/wiki/Algeria" title="Algeria">Algeria</a>
</td>
<td><a href="/wiki/Mount_Tahat" title="Mount Tahat">Mount Tahat</a>
</td>
<td rowspan="1" align="center"><span style="display:none" data-sort-value="7003300300000000000♠"></span>3003&nbsp;m<br>9,852&nbsp;ft
</td>
<td><a href="/wiki/Chott_Melrhir" title="Chott Melrhir">Chott Melrhir</a>
</td>
<td rowspan="1" align="center"><span style="display:none" class="sortkey">2998600000000000000♠</span><span style="color:red">−40&nbsp;m<br>−131&nbsp;ft</span>
</td>
<td rowspan="1" align="center"><span style="display:none" data-sort-value="7003304300000000000♠"></span>3043&nbsp;m<br>9,984&nbsp;ft
</td></tr>
</tbody><tfoot></tfoot></table>
"""
        soup = BeautifulSoup(html, 'html.parser')
        tables = readHTML.readTables(soup)
        listt = readHTML.tableTo2d(tables[0])
        table2d = listt[0]
        print(table2d.toHTML())
        print(table2d.nrows)
        assert len(listt) == 1
        headers = readHTML.getMainColHeaders(table2d.htmlMatrix)
        print(headers)
Example #17
def updateJsonFile(fileName):
    file, file_extension = os.path.splitext(fileName.split("/")[-1])
    if "json" not in file_extension:
        return
    with open(fileName, "r") as jsonFile:
        obj = jsonFile.read()

    #stemmer = SnowballStemmer("english")
    try:
        obj = json.loads(obj)
        article = ComplexDecoder().default(obj)
        lineTables = ""
        tables2D = []
        out = ""
        for table in article.tables:

            if table.tableType is None or table.tableType.value == "":
                table.setColHeaders([])
                table.setStartRows(0)
                tables2D.append(table)
                continue
            if table.tableType.value != TableType.WELL_FORMED.value:
                table.setTableType(table.tableType.value)
                table.setColHeaders([])
                table.setStartRows(0)
                tables2D.append(table)
                continue
            else:
                try:
                    startRow, headers = readHTML.getMainColHeaders(
                        table.htmlMatrix)
                except Exception as ex:
                    table.setTableType(table.tableType.value)
                    table.setColHeaders([])
                    table.setStartRows(0)
                    tables2D.append(table)
                    continue

                if startRow == 0:
                    table.setTableType(table.tableType.value)
                    table.setColHeaders([])
                    table.setStartRows(startRow)
                    tables2D.append(table)
                    continue
                table.setStartRows(startRow)

                #startRow = int(table.startRows)
                matrix = np.array(table.htmlMatrix)
                listOfLevelHeaders = []
                for i in range(startRow):
                    listOfLevelHeaders.append(matrix[i])
                headersMatch = []
                for row in listOfLevelHeaders:
                    cleanTagHeaders = []
                    for col in range(len(row)):
                        cell = BeautifulSoup(row[col], "html.parser")
                        cell = readHTML.cleanTableCellTag(cell)
                        text = " ".join(
                            [s for s in cell.strings if s.strip('\n ') != ''])
                        text = text.replace("*", "").replace("@", "")
                        cleanTagHeaders.append(text)
                        cleanTagHeaders = [
                            textProcessing.cleanCellHeader(h)
                            for h in cleanTagHeaders
                        ]
                    headersMatch.append(cleanTagHeaders)
                lastRow = headersMatch[-1]
                headersMatch[-1] = [
                    'spancol' if h == '' else h for h in lastRow
                ]
                newHeader = []
                for col in range(len(headersMatch[0])):
                    textCol = headersMatch[0][col]
                    for row in range(1, len(headersMatch)):
                        textCol += "**" + headersMatch[row][col]
                    newHeader.append(textCol)
                newHeader = [re.sub('^\\**', '', h) for h in newHeader]
                if startRow > 1:
                    newHeader = [
                        h[:-2] if h.endswith("**") else h for h in newHeader
                    ]
                newHeader = textProcessing.orderHeaders(newHeader)
                newHeaderType = []
                for i, col in enumerate(newHeader):
                    colType = readHTML.getColumnType(i, startRow,
                                                     table.htmlMatrix)
                    newHeaderType.append(newHeader[i] + "@" + str(colType))
                table.setColHeaders(newHeaderType)
                table.ncols = len(newHeaderType)
                table.setTableType(table.tableType.value)
                tables2D.append(table)
                try:
                    out = extractLinks2(article.title, table)
                except Exception as ex1:
                    print("Error links extraction: ", table.tableId)
                    traceback.print_exc()

        article.setTables(tables2D)
        with open(FOLDER_OUT + "/" + file + ".json", "w") as f:
            f.write(
                json.dumps(article.reprJSON(), cls=ComplexEncoder,
                           skipkeys=True))

        return out
    except Exception as ex:
        print("Error File: ", file)
        traceback.print_exc()
Example #18
def extractLinks(articleTitle, table):
    out = ""
    tarray = np.array(table.htmlMatrix)
    start = table.startRows
    colHeaders = table.colHeaders
    #colHeaders = ["protag_article@3"]
    #colHeaders.extend(table.colHeaders)
    line = table.tableId + "\t" + str(colHeaders) + "\t" + str(len(tarray[0])) + \
           "\t" + str(len(tarray) - table.startRows) + "\t"
    prot = wikiLink(articleTitle)
    pwd = wikidataDAO.getWikidataID(prot)
    if pwd is None:
        pwd = ""
    if len(colHeaders) > 1:
        pairLink = {}
        tlinks = [[[] for x in range(tarray.shape[1])]
                  for y in range(len(tarray) - start)]
        rowLink = 0
        for row in range(start, tarray.shape[0]):
            for col in range(tarray.shape[1]):
                contentA = tarray[row][col]
                bscell = BeautifulSoup(contentA, "html.parser")
                linksCell = readHTML.readTableCellLinks(bscell)
                tlinks[rowLink][col] = linksCell
            rowLink += 1
        write = False

        for row in range(len(tlinks)):
            for i in range(len(tlinks[0])):
                nameCol2 = colHeaders[i]
                linksR = tlinks[row][i]
                pos = str(start) + ":" + str(row + start) + ":" + str(
                    -1) + ":" + str(i)
                if len(linksR) == 0:
                    continue
                else:
                    for link in linksR:
                        _link = wikiLink(link)
                        if _link is not None and _link != "" and _link != prot:
                            wd = wikidataDAO.getWikidataID(_link)
                            if wd is None:
                                wd = ""
                            props = []
                            if pwd != "" and wd != "":
                                props = wikidataDAO.getRelations(pwd, wd)
                            if len(props) > 0:
                                for p in props:
                                    out += line + pos + "\t" + "protag_article@3"+ "\t" + nameCol2 + "\t" + prot + "\t"+ _link + "\t"+\
                                    pwd+"\t"+wd+"\t"+p+"\n"
                            else:
                                out += line + pos + "\t" + "protag_article@3"+ "\t" + nameCol2 + "\t" + prot + "\t"+ _link + "\t"+\
                                    pwd+"\t"+wd+"\t"+""+"\n"

        for row in range(len(tlinks)):
            for i in range(len(tlinks[0])):
                for j in range(i + 1, len(tlinks[0])):
                    pos = str(start) + ":" + str(row + start) + ":" + str(
                        i) + ":" + str(j)
                    linksL = tlinks[row][i]
                    linksR = tlinks[row][j]
                    if set(linksL) == set(linksR):
                        continue
                    if len(linksL) == 0 or len(linksR) == 0:
                        continue
                    for ll in linksL:
                        for lr in linksR:
                            lla = wikiLink(ll)
                            llb = wikiLink(lr)
                            if lla != "" and llb != "" and lla != llb:
                                wd1 = wikidataDAO.getWikidataID(lla)
                                if wd1 is None:
                                    wd1 = ""
                                wd2 = wikidataDAO.getWikidataID(llb)
                                if wd2 is None:
                                    wd2 = ""
                                props = []
                                if wd1 != "" and wd2 != "":
                                    props = wikidataDAO.getRelations(wd1, wd2)
                                if len(props) > 0:
                                    for p in props:
                                        out += line + pos + "\t" + colHeaders[i] + "\t" + colHeaders[j] + \
                                                       "\t" + lla + "\t" + llb + "\t" + wd1 + "\t" + wd2+"\t"+p+"\n"
                                else:
                                    out += line + pos + "\t" + colHeaders[i] + "\t" + colHeaders[j] + \
                                            "\t" + lla + "\t" + llb + "\t" + wd1 + "\t" + wd2 + "\t" + "" + "\n"
        return out
Example #19
def formatFeatures(content):
    # Extract formatting features from the HTML cell markup in content
    bullets = 0
    resources = 0
    hasFormat = 0
    multipleLine = 0
    try:
        #print(content)
        bsoup = BeautifulSoup(content, "html.parser")
        #print(bsoup)
        if "<td" in content:
            cell = bsoup.find("td")
        else:
            cell = bsoup.find("th")
        #print(cell)
        links = readHTML.readTableCellLinks(cell)
        # count bullet lists
        bullets += len(cell.find_all("ul"))
        # count enumerations
        bullets += len(cell.find_all("ol"))
        # count formatting tags
        hasFormat += len(cell.find_all("font"))
        hasFormat += len(cell.find_all("b"))
        hasFormat += len(cell.find_all("i"))
        hasFormat += len(cell.find_all("th"))
        hasFormat += len(cell.find_all("small"))
        # count line breaks (multiple lines)
        multipleLine += len(cell.find_all("br"))
        noLinksText = readHTML.getTagTextNoLinks(cell)
        cspan = 1 if cell.get('colspan') is not None else 0
        rspan = 1 if cell.get('rowspan') is not None else 0
        cell.attrs = {}
        text = str(cell)
        length = len(text)

        noLinksText = [s for s in noLinksText.strings if s.strip('\n ') != '']
        noLinksText = " ".join(noLinksText)
        if cspan == 1 or rspan == 1:
            hasSpan = 1
        else:
            hasSpan = 0

        return {
            'length': length,
            'bullets': bullets,
            'hasFormat': hasFormat,
            'multipleLine': multipleLine,
            'noLinksText': len(noLinksText),
            "links": len(links),
            "hasSpan": hasSpan
        }
    except Exception as ex:
        raise Exception("Error html cell") from ex
Example #20
def extractLinksFromColumns(fileName):
    filenamesplit = fileName.split("/")
    file, file_extension = os.path.splitext(filenamesplit[-1])
    out = ""
    try:
        if file_extension != ".json":
            return

        with open(fileName, "r") as jsonFile:
            obj = jsonFile.read()

        obj = json.loads(obj)
        article = ComplexDecoder().default(obj)
        prot = getTableProtagonist(article.title)
        for table in article.tables:
            tarray = np.array(table.htmlMatrix)
            colHeaders = ["protag_article@3"]
            colHeaders.extend(table.colHeaders)
            rowHeaders = table.rowHeaders
            setrH = set(rowHeaders)

            line = table.tableId + "\t" + str(colHeaders) + "\t" +  str(len(table.htmlMatrix[0])) + \
                   "\t" + str(len(table.htmlMatrix)-table.startRows) + "\t"

            if len(colHeaders) > 1:
                setcH = set(colHeaders)
                if len(setcH) == 1 and "spancol" in colHeaders[0]:
                    continue
                pairLink = {}
                start = table.startRows  # dictTableInf["nRows"] - dictTableInf["nRowHeaders"]
                tlinks = [[[] for x in range(tarray.shape[1])]
                          for y in range(len(tarray) - start)]
                rowLink = 0
                for row in range(start, tarray.shape[0]):
                    for col in range(tarray.shape[1]):
                        contentA = tarray[row][col]
                        bscell = BeautifulSoup(contentA, "html.parser")
                        linksCell = readHTML.readTableCellLinks(bscell)
                        tlinks[rowLink][col] = linksCell
                    rowLink += 1
                write = False
                for row in range(len(tlinks)):
                    for i in range(len(tlinks[0])):
                        linksR = tlinks[row][i]
                        pos = str(row) + ":" + str(-1) + ":" + str(i)
                        if len(linksR) == 0:
                            continue
                        for link in linksR:
                            _link = wikiLink(link)
                            if _link is not None and _link != "" and _link != prot:
                                out += line + pos + "\t" + colHeaders[0] + "\t" + colHeaders[i + 1] + "\t" + prot + "\t" + _link + "\n"
                                write = True
                for row in range(len(tlinks)):
                    for i in range(len(tlinks[0])):
                        for j in range(i + 1, len(tlinks[0])):
                            pos = str(row) + ":" + str(i) + ":" + str(j)
                            linksL = tlinks[row][i]
                            linksR = tlinks[row][j]
                            if set(linksL) == set(linksR):
                                continue
                            if len(linksL) == 0 or len(linksR) == 0:
                                continue
                            for ll in linksL:
                                for lr in linksR:
                                    lla = wikiLink(ll)
                                    llb = wikiLink(lr)
                                    if lla != "" and llb != "" and lla != llb:
                                        out += line + pos + "\t" + colHeaders[i + 1] + "\t" + colHeaders[j + 1] + "\t" + lla + "\t" + llb + "\n"
                                        write = True

                if not write:
                    out += line + "" + "\t" + "" + "\t" + "" + "\t" + "" + "\t" + "" + "\n"
            else:
                if len(setrH) > 0:
                    if len(setrH) == 1 and "spancol" in table.rowHeaders[0]:
                        continue
                    out += line + "" + "\t" + "" + "\t" + "" + "\t" + "" + "\t" + "" + "\n"
    except:
        print("Error file: ", fileName)
        traceback.print_exc()
    return out
Example #21
def normalizeTables(filename):
    file = filename.split("##$##")[0]
    cont = int(filename.split("##$##")[1])
    print("cont: ", cont)
    try:
        bzFile = bz2.BZ2File(file, "rb")
        soup = BeautifulSoup(bzFile.read(), 'html.parser')
        bzFile.close()
    except:
        print("Error reading file: ", filename)
        # Match the 7-field tab-separated layout of the normal return value
        return str(cont) + "\t0\t0\t0\t0\t0\t0\n"
    dictStat = {}
    dictStat[TableType.ILL_FORMED.value] = 0
    dictStat["NO_PROCESSED"] = 0
    dictStat[TableType.WELL_FORMED.value] = 0
    dictStat[TableType.SMALLTABLE.value] = 0
    dictStat[TableType.WITH_INNER_TABLE.value] = 0
    dictStat[TableType.FORMAT_BOX.value] = 0
    try:
        title = readHTML.readTitle(soup)
        tables = readHTML.readTables(soup)
        tables2d = []
        contTables = 1
        formatTables = 0
        for it, t in enumerate(tables):
            try:
                parents = [p.name for p in t.findParents()]
                if t.parent is not None and ("th" in parents or "td" in parents
                                             or "tr" in parents):
                    continue
                start = time.time()
                listt2d = readHTML.tableTo2d(t)
                logging.debug("Time reading table: " +
                              str(time.time() - start))
                validTables = []
                if listt2d is None or len(listt2d) == 0:

                    newTable = readHTML.saveIllTable(
                        t, TableType.ILL_FORMED.value)
                    if newTable is not None:
                        validTables.append(newTable)
                        dictStat[TableType.ILL_FORMED.value] += 1
                    else:
                        dictStat["NO_PROCESSED"] += 1

                else:
                    if len(listt2d) > 10:
                        # Assumption: an implausibly large split is recorded
                        # as a single ill-formed table (the original appended
                        # an undefined newTable here)
                        newTable = readHTML.saveIllTable(
                            t, TableType.ILL_FORMED.value)
                        if newTable is not None:
                            validTables.append(newTable)
                        dictStat[TableType.ILL_FORMED.value] += 1
                        continue
                    for t2d in listt2d:
                        if t2d.tableType == TableType.FORMAT_BOX.value:
                            dictStat[TableType.FORMAT_BOX.value] += 1
                            formatTables += 1
                            continue

                        if t2d.tableType == TableType.SMALLTABLE.value:
                            dictStat[TableType.SMALLTABLE.value] += 1
                            continue

                        if t2d.tableType == TableType.ILL_FORMED.value:
                            dictStat[TableType.ILL_FORMED.value] += 1
                            validTables.append(t2d)
                            continue

                        if t2d.tableType == TableType.WITH_INNER_TABLE.value:
                            dictStat[TableType.WITH_INNER_TABLE.value] += 1
                            validTables.append(t2d)
                            continue
                        #print(t2d.toHTML())
                        validTables.append(t2d)
                        dictStat[TableType.WELL_FORMED.value] += 1

                for t2d in validTables:
                    tableId = str(cont) + "." + str(contTables)
                    t2d.setTableId(tableId)
                    tables2d.append(t2d)
                    contTables += 1
            except:
                traceback.print_exc()
                print("Error: ", filename, it)
                continue
        if len(tables2d) > 0:
            article = Article(articleId=str(cont),
                              title=title,
                              tables=tables2d)
            with open(FOLDER_OUT + "/" + str(cont) + ".json", "w") as f:
                f.write(
                    json.dumps(article.reprJSON(),
                               cls=ComplexEncoder,
                               skipkeys=True))
        else:
            if len(tables) == formatTables:
                logging.debug("Format table: " + filename)
            else:
                logging.debug("Error none useful table: " + filename)
        logging.debug(dictStat)
    except:
        traceback.print_exc()
        logging.debug("Error file ", filename)

    return str(cont)+"\t"+  str(dictStat[TableType.ILL_FORMED.value])+"\t"+ \
        str(dictStat["NO_PROCESSED"])+"\t"+ \
        str(dictStat[TableType.WELL_FORMED.value])+"\t"+ \
        str(dictStat[TableType.SMALLTABLE.value])+"\t" + \
        str(dictStat[TableType.WITH_INNER_TABLE.value])+"\t" + \
        str(dictStat[TableType.FORMAT_BOX.value])+"\n"