def isTexted(sFilename, fRatio):
    """Tell whether enough of the text lines of a PageXml file carry text.

    The file is parsed, PageXml.countTextLineWithText() gives the number of
    text lines that have text and the total number of text lines, and the
    resulting ratio is compared against fRatio.

    sFilename: path of the XML file to parse.
    fRatio:    threshold; the document ratio must be strictly greater than
               it for the function to return True.

    Return True if the ratio exceeds fRatio, False otherwise. When some, but
    not enough, lines are texted, a warning is traced.
    A document without any text line has a ratio of 0 (instead of raising
    ZeroDivisionError).
    """
    parser = etree.XMLParser(remove_blank_text=True)
    doc = etree.parse(sFilename, parser)
    cntTxt, cnt = PageXml.countTextLineWithText(doc)
    del doc  # the tree is not needed once the counts are known

    # guard against documents that contain no text line at all
    fDocRatio = float(cntTxt) / cnt if cnt else 0.0

    if fDocRatio > fRatio:
        return True
    if fDocRatio > 0:
        # partially texted, but below the threshold: worth reporting
        traceln("Warning: %d texted out of %d (%.2f) %s" % (cntTxt, cnt, fDocRatio, sFilename))
    return False
def test_countTextLineWithText():
    """Check PageXml.countTextLineWithText on a small in-memory PageXml.

    The sample page holds two TextLine elements: one whose Unicode element
    contains "52." and one whose Unicode element is empty, so the expected
    result is (1, 2). The parsed document is returned.
    """
    page_xml = b"""<?xml version="1.0" encoding="UTF-8" standalone="yes"?> <PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd"> <Metadata> <Creator>Tilla</Creator> <Created>2016-08-18T13:35:08.252+07:00</Created> <LastChange>2016-12-01T09:53:39.610+01:00</LastChange> </Metadata> <Page imageFilename="MM_1_001_001.jpg" imageWidth="1277" imageHeight="3518" type="other"> <TextRegion id="region_1502087153356_21" custom="readingOrder {index:0;}"> <Coords points="503,75 705,75 705,195 503,195"/> <TextLine id="line_1502089038759_357" custom="readingOrder {index:0;}" DU_row="O" DU_col="O" DU_header="O"> <Coords points="545,131 679,131 679,181 545,181"/> <Baseline points="545,176 679,176"/> <TextEquiv> <Unicode>52.</Unicode> </TextEquiv> </TextLine> <TextEquiv> <Unicode/> </TextEquiv> </TextRegion> <TextRegion id="region_1502087156278_22" custom="readingOrder {index:1;}"> <Coords points="2267,48 2832,48 2832,192 2267,192"/> <TextLine id="line_1502089042728_358" custom="readingOrder {index:0;}" DU_row="O" DU_col="O" DU_header="O"> <Coords points="2307,110 2817,112 2817,162 2307,160"/> <Baseline points="2307,155 2817,157"/> <TextEquiv> <Unicode></Unicode> </TextEquiv> </TextLine> <TextEquiv> <Unicode/> </TextEquiv> </TextRegion> </Page> </PcGts>"""

    # parse straight from memory
    tree = etree.parse(BytesIO(page_xml))

    counts = PageXml.countTextLineWithText(tree)
    assert counts == (1, 2)

    return tree