Example #1
def shakeRectangleShapes(pageNd, lClusterPoly, dCellCndPerRow, dTextInCells):
    # NOTE(review): this extract is truncated. The lines just below are
    # presumably the text of a docstring whose triple quotes were lost, and
    # several statements further down are incomplete (unclosed call on the
    # minimum_rotated_rectangle line, `if abs(...) < abs(...` never closed).
    # Restore them from the original file before trying to run this.
    #
    # What the visible code does:
    #   - walks the row indices of dCellCndPerRow in sorted order;
    #   - for each row, looks at every candidate cell except the first one and
    #     asserts that the text lines registered for that cell in dTextInCells
    #     (keyed by the cell's WKT) yield valid polygons;
    #   - builds a minimum rotated rectangle from `ltl` and, when it is not
    #     empty, compares its vertical extent (bounds[1] vs bounds[3]) with
    #     that of lClusterPoly[i];
    #   - rescales shapes horizontally (xfact=2 for the rectangle, xfact=1.5
    #     for lClusterPoly[i]);
    #   - returns lNewPoly.
        delete one 'col' and see if the rectangle is stable (area? comparison; rather angle)
        angle see baseline object or 
        take the longest line in the polygon and compute its angle
            radian = math.atan((shape.lastpoint.x - shape.firstpoint.x)/(shape.lastpoint.y - shape.firstpoint.y))  
            degrees = radian * 180 / math.pi  
        return degrees  
    #sort by size (reverse)
    #     lClusterPoly = [ShapeLoader.node_to_Polygon(nd) for nd in lClusters]

    ## can be restricted to ones with high angle?
    lNewPoly = []
    for i in sorted(dCellCndPerRow.keys()):
        lcellRows = dCellCndPerRow[i]
        # skip first
        # NOTE(review): nothing visible ever appends to `ltl`; the statement
        # collecting the text-line polygons was presumably lost here.
        ltl = []
        for c in lcellRows[1:]:
            if c.wkt in dTextInCells:
                for tl in dTextInCells[c.wkt]:
                    assert ShapeLoader.node_to_Polygon(tl).is_valid
        # NOTE(review): the remaining arguments and closing parenthesis of this
        # call are missing from the extract.
        rectangle = ShapeLoader.minimum_rotated_rectangle(ltl,
        #         print (i,lClusterPoly[i].intersection(rectangle).area / lClusterPoly[i].area)
        if not rectangle.is_empty:
            # compares the height of the new rectangle with the height of the
            # current cluster polygon (end of the condition is missing)
            if abs(rectangle.bounds[1] -
                   rectangle.bounds[3]) < abs(lClusterPoly[i].bounds[1] -
                #                 rectangle=scale(rectangle,xfact=1.5)
                cellNd = etree.Element('COL')
                rectangle = scale(rectangle, xfact=2)
                cellNd.set('points', ShapeLoader.getCoordsString(rectangle))

                lClusterPoly[i] = scale(lClusterPoly[i], xfact=1.5)
            # NOTE(review): as shown, lClusterPoly[i] is scaled a second time
            # here; an `else:` line may have been lost -- confirm.
            lClusterPoly[i] = scale(lClusterPoly[i], xfact=1.5)
        ## get length and compute angle !!!  take the one with more horizontal angle::

#         cellNd = etree.Element('COL')
#         pageNd.append(cellNd)
#         if not rectangle.is_empty:
#             rectangle=scale(rectangle,xfact=2)
#             cellNd.set('points',ShapeLoader.getCoordsString(rectangle))
#     print (len(lClusterPoly),len(lNewPoly))
    # NOTE(review): lNewPoly is never filled in the visible code, so this
    # returns an empty list as shown -- verify against the original.
    return lNewPoly
    def parseDocNodeLabel(self, graph_node, defaultCls=None):
        # NOTE(review): truncated extract. The two lines below are presumably
        # docstring text whose triple quotes were lost; a `try:` (matching the
        # `except KeyError:` further down), at least two `else:` lines and two
        # closing parentheses are also missing.
        #
        # Visible behaviour:
        #   - reads the label from attribute self.sLabelAttr of graph_node.node;
        #   - forces it to 'CH' when self.bColumnHeader is set and the node has
        #     DU_header == "CH";
        #   - maps it through self.dConverter; when the mapped value is None
        #     the node is classified from the position of its centroid inside
        #     its parent's polygon ("Sm", "St" or "Sb");
        #   - translates the result through self.dXmlLabel2Label and returns
        #     it, raising ValueError for an unknown label.
        # `defaultCls` is not used in the visible code.
        Parse and set the graph node label and return its class index
        raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
        domnode = graph_node.node
        sXmlLabel = domnode.get(self.sLabelAttr)

        # in case we also deal with column headers
        if self.bColumnHeader and 'CH' == domnode.get("DU_header"):
            sXmlLabel = 'CH'

        # NOTE(review): a label absent from dConverter raises KeyError here;
        # whether this line sits inside the lost `try:` cannot be told.
        sXmlLabel = self.dConverter[sXmlLabel]
        if sXmlLabel is None:
            # special processing for singletons TODO: make it more efficient?
            ptTxt = ShapeLoader.node_to_Polygon(domnode).centroid
            plgCell = ShapeLoader.node_to_Polygon(domnode.getparent())
            # cell polygon shrunk to a third of its height (yfact=0.333);
            # the end of this call is missing from the extract
            plgMiddle = shapely.affinity.scale(plgCell, 1, 0.333, 1,
            if plgMiddle.contains(ptTxt):
                sXmlLabel = "Sm"
                # NOTE(review): an `else:` presumably preceded this test, and
                # another one preceded the "Sb" assignment -- confirm.
                if ptTxt.y < plgCell.centroid.y:
                    sXmlLabel = "St"
                    sXmlLabel = "Sb"
            sLabel = self.dXmlLabel2Label[sXmlLabel]
        except KeyError:
            #             #not a label of interest, can we ignore it?
            #             try:
            #                 self.checkIsIgnored(sXmlLabel)
            #                 sLabel = self.sDefaultLabel
            #                 #if self.lsXmlIgnoredLabel and sXmlLabel not in self.lsXmlIgnoredLabel:
            #             except:
            # the last format argument and the closing parentheses are missing
            raise ValueError("Invalid label '%s'"
                             " (from @%s or @%s) in node %s" %
                             (sXmlLabel, self.sLabelAttr, self.sDefaultLabel,
        # traceln(etree.tostring(domnode), sLabel)
        return sLabel
Example #3
    def addClusterToDom(self, dCluster, bMoveContent=False):
        # NOTE(review): truncated extract. The next line is presumably the
        # docstring text (quotes lost). Missing as well: the heads of several
        # calls (the one receiving etree.Comment(...), the one receiving the
        # joined node ids), the try/except skeleton around the polygon loading
        # and around the `spoints` computation, and the body of the final loop.
        #
        # Visible behaviour, for each (key, list of node indices) in dCluster:
        #   - the page DOM node is taken from the first node of the first
        #     cluster (self.lNode[idx].page.node);
        #   - the polygons of the cluster nodes are unioned (invalid polygons
        #     are replaced by their convex hull);
        #   - the minimum rotated rectangle of that union is serialised as
        #     "x,y x,y ..." with int-truncated coordinates, falling back to ""
        #     when no coordinate sequence is available;
        #   - a 'Cluster' node with a 'Coords' child carrying those points is
        #     created through PageXml.createPageXmlNode.
        Add Cluster elements to the Page DOM node
        pageNode = None
        for x, lnidx in dCluster.items():
            if pageNode is None:
                pageNode = self.lNode[lnidx[0]].page.node
                    etree.Comment("Clusters created by the conjugate graph"))

            # lp = [ShapeLoader.node_to_Polygon(self.lNode[_i].node) for _i in lnidx]
            # Make it robust to bad data...
            lp = []
            for _i in lnidx:
                # NOTE(review): the `try:` / append lines and the except body
                # are missing here.
                except ValueError:
            contour = cascaded_union(
                [p if p.is_valid else p.convex_hull for p in lp])
            # print(contour.wkt)
                # NOTE(review): the loop variable `x` of the generators below
                # shadows the dict key `x` of the outer loop.
                spoints = ' '.join(
                    "%s,%s" % (int(x[0]), int(x[1]))
                    for x in contour.minimum_rotated_rectangle.exterior.coords)
                    spoints = ' '.join(
                        "%s,%s" % (int(x[0]), int(x[1]))
                        for x in contour.minimum_rotated_rectangle.coords)
                    # JL got once a: NotImplementedError: Multi-part geometries do not provide a coordinate sequence
                    spoints = ""
            #print (spoints)
            ndCluster = PageXml.createPageXmlNode('Cluster')
            # add the space separated list of node ids
                " ".join(self.lNode[_i].node.get("id") for _i in lnidx))
            coords = PageXml.createPageXmlNode('Coords')
            coords.set('points', spoints)

            if bMoveContent:
                # move the DOM node of the content to the cluster
                # NOTE(review): loop body missing from the extract.
                for _i in lnidx:

def getClusterCoords(lElts):
    """
    Return the convex hull of a group of DOM elements as a points string.

    Each element is turned into a polygon with ShapeLoader.node_to_Polygon;
    elements for which that raises ValueError are skipped (best effort on bad
    data). Invalid polygons are replaced by their own convex hull, all
    polygons are unioned, and the convex hull of the union is serialised.

    lElts: iterable of DOM nodes accepted by ShapeLoader.node_to_Polygon
    return: "x1,y1 x2,y2 ..." with coordinates truncated by int(), or ""
            when the hull exposes no usable coordinate sequence

    NOTE(review): the `try:` / `except:` lines of this function were missing
    from the extract and have been restored from the dangling `except`
    clauses; confirm against the original file.
    """
    def _points(coords):
        # "x,y" pairs separated by single spaces, coordinates truncated to int
        return ' '.join("%s,%s" % (int(pt[0]), int(pt[1])) for pt in coords)

    lp = []
    for e in lElts:
        try:
            lp.append(ShapeLoader.node_to_Polygon(e))
        except ValueError:
            # malformed coordinates: ignore this element, keep the others
            pass
    contour = cascaded_union([p if p.is_valid else p.convex_hull for p in lp])
    # print(contour.wkt)
    try:
        # usual case: the hull is a polygon
        return _points(contour.convex_hull.exterior.coords)
    except Exception:
        pass
    try:
        # degenerate hull without an exterior ring
        return _points(contour.convex_hull.coords)
    except Exception:
        # JL got once a: NotImplementedError: Multi-part geometries do not
        # provide a coordinate sequence
        return ""
Example #5
def assignTextLinesToCells(lTL, lCells):
    """
    Best-assign text lines to cell shapes.

    Each node of lTL is converted to a polygon with
    ShapeLoader.node_to_Polygon; non-empty polygons are passed to
    bestRegionsAssignment together with lCells, and when a cell shape is
    returned the text-line node is registered under that shape.

    lTL: list of text-line DOM nodes
    lCells: list of candidate cell shapes
    return: dict mapping cell.wkt -> list of text-line nodes (input order);
            cells that received no text line have no entry

    NOTE(review): the `try:` half of the original append-or-create idiom was
    missing from the extract; it is restored here with dict.setdefault.
    """
    # do it at the cc level?
    dCellTL = {}
    for ndTL in lTL:
        txt = ShapeLoader.node_to_Polygon(ndTL)
        if txt.is_empty:
            continue
        cellshape = bestRegionsAssignment(txt, lCells)
        if cellshape:
            # group the text lines by the WKT of the cell they fall into
            dCellTL.setdefault(cellshape.wkt, []).append(ndTL)
    return dCellTL
Example #6
 def loadClusterNode(cls, ndPage, nd, sAlgo, bComputeShape=True):
     """
     Load a cluster from its XML node.

     The node gets a generated name "<sAlgo>_<cls.cnt>" (and cls.cnt is
     incremented) when it has no "name" attribute. The cluster content is the
     set of whitespace-separated ids found in the "content" attribute.

     The shape is read from the node itself; when that raises IndexError it
     is computed with cls.computeShape(ndPage, setID) if bComputeShape is
     true, and left to None otherwise.

     return: cls(name, setID, shape), or None when the cluster has no content
             (empty or missing "content" attribute)

     NOTE(review): the `try:` and `else:` lines were missing from the extract
     and have been restored from the remaining `except` clause and the
     indentation; confirm against the original file.
     """
     name = nd.get("name")
     if name is None:
         name = "%s_%d" % (sAlgo, cls.cnt)
         cls.cnt += 1
         nd.set("name", name)
     # a missing attribute is handled like an empty one
     setID = set((nd.get("content") or "").split())
     if not setID:
         return None
     try:
         shape = ShapeLoader.node_to_Polygon(nd)
     except IndexError:
         # no usable coordinates stored on the cluster node
         shape = cls.computeShape(ndPage, setID) if bComputeShape else None
     return cls(name, setID, shape)
def main(lsFilename, lsOutFilename):
    # Annotate the SeparatorRegion nodes of each input file with a DU_Sep
    # attribute and write the result to the paired output file.
    #
    # lsFilename / lsOutFilename: input and output paths, consumed pairwise
    # with zip() (extra entries of the longer list are ignored).
    #
    # Visible behaviour, per file:
    #   - every separator starts as consistent (_bConsistent = True);
    #   - inside each TableRegion, each TextLine of each TableCell is reduced
    #     to a single Point tagged with the row/col range of its cell;
    #   - a separator is marked inconsistent when the largest row (resp.
    #     column) index on one side is >= the smallest one on the other side;
    #   - DU_Sep is set to "S" or "I" and a percentage is traced.
    #
    # NOTE(review): truncated extract -- several `else:`, `try:` / `except`
    # and `continue` lines are missing (flagged below). Restore them from the
    # original file before running this.
    #for the pretty printer to format better...
    parser = etree.XMLParser(remove_blank_text=True)
    cnt, cntS = 0, 0
    for sFilename, sOutFilename in zip(lsFilename, lsOutFilename):
        cntDoc, cntDocS = 0, 0

        doc = etree.parse(sFilename, parser)
        root = doc.getroot()
        # Separators are not under tableRegion... :-/
        lNdSep = MultiPageXml.getChildByName(root ,'SeparatorRegion')
        loSep = [ShapeLoader.node_to_LineString(ndSep) for ndSep in lNdSep]
        for _o in loSep: _o._bConsistent = True
        if not lNdSep:
            traceln("Warning: no separator in %s"%sFilename)
            # NOTE(review): an `else:` presumably preceded the next line; the
            # rest of the per-document work is indented as its body.
            traceln("%25s  %d separators" % (sFilename, len(lNdSep)))
            lNdTR = MultiPageXml.getChildByName(root ,'TableRegion')
            for ndTR in lNdTR:
                lNdCells= MultiPageXml.getChildByName(ndTR ,'TableCell')
                # NOTE(review): body of this `if` is missing (presumably a
                # `continue`; max() below fails on an empty sequence).
                if not lNdCells:
                nbRows = max(int(x.get('row')) for x in lNdCells)
                # build a list of Shapely objects augmented with our own table attributes
                # NOTE(review): nothing visible appends to loText; the append
                # was presumably lost at the end of the TextLine loop.
                loText = [] #
                for ndCell in lNdCells:
                    minRow = int(ndCell.get('row'))
                    minCol = int(ndCell.get('col'))
                    maxRow = minRow + int(ndCell.get('rowSpan')) - 1
                    maxCol = minCol + int(ndCell.get('colSpan')) - 1
#                     # ignore cell spanning the whole table height
#                     if maxRow >= nbRows:
#                         continue
                    for ndText in MultiPageXml.getChildByName(ndCell ,'TextLine'):
                            # NOTE(review): the try/except skeleton around
                            # these two lines is missing.
                            oText = ShapeLoader.node_to_Polygon(ndText)
                            traceln("WARNING: SKIPPING 1 TExtLine: cannot make a polygon from: %s" % etree.tostring(ndText))
                        # reflecting the textbox as a single point
                        (minx, miny, maxx, maxy) = oText.bounds
                        # is the baseline horizontal or vertical??
                        # half of the smaller side of the bounding box
                        fDelta = min((maxx-minx) / 2.0, (maxy-miny) / 2.0) 
                        if isBaselineHorizontal(ndText):
                            # supposed Horizontal text
                            oText = geom.Point(minx + fDelta  , (maxy + miny)/2.0)
                            ndText.set("Horizontal", "TRUE")

                            # NOTE(review): an `else:` presumably preceded the
                            # next two lines (vertical text case).
                            ndText.set("Horizontal", "nope")
                            oText = geom.Point((minx + maxx)/2.0  , miny + fDelta)
                        # considering it as a point, using its centroid
                        # does not work well due to loooong texts oText = oText.centroid
                        oText._minRow, oText._minCol = minRow, minCol
                        oText._maxRow, oText._maxCol = maxRow, maxCol
                        if DEBUG: oText._domnd = ndText
                traceln("    TableRegion  %d texts" % (len(loText)))
                if loText:
                    # checking in tun each separator for table-consistency
                    sp = ShapePartition(loText)
                    for oSep in loSep:
                        (minx, miny, maxx, maxy) = oSep.bounds
                        if maxx - minx >= maxy - miny:
                            # supposed Horizontal
                            l = sp.getObjectAboveLine(oSep)
                            if l:
                                maxRowBefore = max(_o._maxRow for _o in l)
                                l = sp.getObjectBelowLine(oSep)
                                if l:
                                    minRowAfter  = min(_o._minRow for _o in l)
                                    if maxRowBefore >= minRowAfter: oSep._bConsistent = False
                            # NOTE(review): an `else:` (vertical separator)
                            # presumably preceded the column check below.
                            l1 = sp.getObjectOnLeftOfLine(oSep)
                            if l1:
                                maxColBefore = max(_o._maxCol for _o in l1)
                                l2 = sp.getObjectOnRightOfLine(oSep)
                                if l2:
                                    minColAfter  = min(_o._minCol for _o in l2)
                                    if maxColBefore >= minColAfter: 
                                        oSep._bConsistent = False
                                        if DEBUG:
                                            # DEBUG
                                            for o in l1:
                                                if o._maxCol >= minColAfter: print("too much on right", etree.tostring(o._domnd))
                                            for o in l2:
                                                if o._minCol <= maxColBefore: print("too much on left", etree.tostring(o._domnd))
                # end of TableRegion
            # end of document
            for ndSep, oSep in zip(lNdSep, loSep): 
                if oSep._bConsistent:
                    ndSep.set("DU_Sep", "S")
                    cntDocS += 1
                    # NOTE(review): an `else:` presumably preceded the "I"
                    # assignment; as shown it would overwrite the "S".
                    ndSep.set("DU_Sep", "I")
                cntDoc += 1
        doc.write(sOutFilename, encoding='utf-8',pretty_print=True,xml_declaration=True)
        # the small constant in the denominator avoids a division by zero when
        # the document has no separator
        traceln('%.2f%% consistent separators - annotation done for %s  --> %s' % (100*float(cntDocS)/(cntDoc+0.000001), sFilename, sOutFilename))
        del doc
        cnt, cntS = cnt+cntDoc, cntS+cntDocS
    # NOTE(review): the last %d receives `cnt`, which accumulates separators,
    # not files -- confirm the intended value.
    traceln('%.2f%% consistent separators - annotation done for %d files' % (100*float(cntS)/(cnt+0.000001), cnt))
    def loadPageCol(self,
                    funIndex=lambda x: x._du_index):
        # NOTE(review): truncated extract. The signature lost parameters: the
        # body uses `ndPage`, `shaper_fun` and `fRatio`, none of which is
        # declared above. The lines below are presumably docstring text whose
        # triple quotes were lost. A `try:` before the first shaper_fun call,
        # the body of the `if (row + rowSpan) > maxHeaderRowSpan:` test and
        # every statement filling loBaseline / dsetTableByCol /
        # dsetTableDataByCol are missing as well.
        #
        # Visible behaviour:
        #   - walks the TableCell nodes of the page, then their Baseline
        #     nodes, numbering each Baseline with a running "du_index";
        #   - turns each Baseline into a shape with shaper_fun, rescales it by
        #     fRatio and tags it with _du_index, _du_nd and _dom_id;
        #   - walks all Baseline nodes of the page a second time;
        #   - returns (loBaseline, dsetTableByCol, dsetTableDataByCol,
        #     maxHeaderRowSpan).
        # `funIndex` is not used in the visible code.
        load the page, looking for Baseline
        can filter by DU_row
        return a list of shapely objects
             , a dict of sorted list of objects, by column
        GT BUG: some Baseline are assigned to the wrong Cell
        => we also fix this here....
        loBaseline = []  # list of Baseline shapes
        i = 0

        dsetTableByCol = defaultdict(set)  # sets of object ids, by col
        dsetTableDataByCol = defaultdict(set)  # sets of object ids, by col
        dO = {}

        dNodeSeen = {}
        # first associate a unique id to each baseline and list them
        lshapeCell = []
        lOrphanBaselineShape = []

        lCells = MultiPageXml.getChildByName(ndPage, "TableCell")
        maxHeaderRowSpan = computeMaxRowSpan(lCells)
        traceln("   - maxHeaderRowSpan=", maxHeaderRowSpan)
        for ndCell in lCells:
            row, col = int(ndCell.get("row")), int(ndCell.get("col"))
            rowSpan = int(ndCell.get("rowSpan"))
            plg = ShapeLoader.node_to_Polygon(ndCell)
            # xmin, xmax of polygon
            lx = [_x for _x, _y in plg.exterior.coords]
            xmin, xmax = min(lx), max(lx)
            plg._row = row
            plg._col = col
            plg._xmin, plg._xmax = xmin, xmax

            for nd in MultiPageXml.getChildByName(ndCell, "Baseline"):
                nd.set("du_index", "%d" % i)
                ndParent = nd.getparent()
                # remember the parent of every Baseline met inside a cell
                dNodeSeen[ndParent.get('id')] = True

                # Baseline as a shapely object
                    # NOTE(review): the `try:` line is missing above this call.
                    o = shaper_fun(nd)  #make a LineString
                except Exception as e:
                    traceln("ERROR: id=", nd.getparent().get("id"))
                    raise e
                # scale the objects, as done when cutting!!
                # useless currently since we make a Point...
                o = shapely.affinity.scale(o, xfact=fRatio, yfact=fRatio)

                o._du_index = i
                o._du_nd = nd
                o._dom_id = nd.getparent().get("id")

                # is this object in the correct cell???
                # We must use the centroid of the text box, otherwise a baseline
                # may be assigned to the next row
                # NOOO x = ShapeLoader.node_to_Polygon(ndParent).centroid.x
                # we must look for the leftest coordinate
                # NO CHECK FOR COLUMNS


                # NOTE(review): the body of this test is missing.
                if (row + rowSpan) > maxHeaderRowSpan:

                i += 1

#         if lOrphanBaselineShape:
#             traceln("    *** error: %d Baseline in incorrect row - fixing this..." % len(lOrphanBaselineShape))
#             for o in lOrphanBaselineShape:
#                 bestrow, bestdeltacol = 0, 9999
#                 try:
#                     y = o.y
#                 except:
#                     y = o.centroid.y
#                 for plg in lshapeCell:
#                     if plg._ymin <= y and y <= plg._ymax:
#                         # sounds good
#                         deltacol = abs(o._bad_cell._col - plg._col)
#                         if deltacol == 0:
#                             # same column, ok it is that one
#                             bestrow = plg._row
#                             break
#                         else:
#                             if bestdeltacol > deltacol:
#                                 bestdeltacol = deltacol
#                                 bestrow = plg._row
#                 traceln("\t id=%s misplaced in row=%s instead of row=%s" %(
#                     o._du_nd.getparent().get("id")
#                     , o._bad_cell._row
#                     , bestrow))
#                 dsetTableByCol[bestrow].add(o._du_index)
#                 del o._bad_cell

# and (UGLY) process all Baseline outside any TableCell...

        # NOTE(review): the filter that skips Baselines already recorded in
        # dNodeSeen is presumably missing just inside this loop (the body is
        # indented one level deeper than the `for`).
        for nd in MultiPageXml.getChildByName(ndPage, "Baseline"):
                #OLD "GOOD" CODE HERE
                nd.set("du_index", "%d" % i)

                # Baseline as a shapely object
                o = shaper_fun(nd)  #make a LineString

                # scale the objects, as done when cutting!!
                # NOTE(review): only xfact is given here, while the in-cell
                # loop above scales both axes -- confirm this is intended.
                o = shapely.affinity.scale(o, xfact=fRatio)

                o._du_index = i
                o._du_nd = nd
                o._dom_id = nd.getparent().get("id")

                i += 1

        return loBaseline, dsetTableByCol, dsetTableDataByCol, maxHeaderRowSpan
Example #9
def createCellsFromZoneClusters(pageNd, dIds, lZones, lClusters, dEdges):
    # NOTE(review): truncated extract. The indented text just below is
    # presumably a docstring whose triple quotes were lost. Several call heads
    # are missing (the statements that consume the assignTextLinesToCells(...)
    # results), as are the end of the shakeRectangleShapes(...) call and the
    # closing bracket of one list comprehension.
    #
    # Visible pipeline:
    #   1. sort cluster nodes by centroid y and zone nodes by centroid x;
    #   2. collect the text lines of each cluster (getClusterTL) and build the
    #      cluster polygons (transformMinima2envelop);
    #   3. intersect cluster polygons with zone polygons
    #      (createCellZoneFromIntersection) and assign text lines to the
    #      candidate cells of each row;
    #   4. adjust the polygons (shakeRectangleShapes, ajustRectangleShapes)
    #      and redo step 3;
    #   5. let checkClusteringPerRow propose rows to delete / add and, if it
    #      proposes new rows, redo step 3 once more;
    #   6. return the result of computeProfile plus dShapeCells.
        lZones: list of zones for columns (ordered left right by construction)
        lClusters: list of cluster NODE!! 
        create cells by zone intersections
        idea! compute 'invariance' of the angle of  the rectangle: if one column less: should be the same angle
           variance for detecting merge: del on column and see if several clusters?
        1 test if rectangle OK
            if all ok and no overlap: create rows and col
        # for overlapping:   -> connected componants for each column
            create alternate rectangles by skipping one cell 
        3 merge if same  #order (at cell level)
        4 split if n connected cells per column
            # how: create alternate rectangles?

    lClusters = sorted(lClusters,
                       key=lambda x: ShapeLoader.node_to_Polygon(x).centroid.y)
    lZones = sorted(lZones,
                    key=lambda x: ShapeLoader.node_to_Polygon(x).centroid.x)

    dClusterWithTL = getClusterTL(pageNd, lClusters, dIds)

    #lClusterPoly = [ShapeLoader.node_to_Polygon(nd) for nd in lClusters]

    lZonePoly = [ShapeLoader.node_to_Polygon(nd) for nd in lZones]

    ## if one clusterpoly is contained in another: merge

    lClusterPoly = transformMinima2envelop(pageNd, dClusterWithTL)
    #     return 0,0,0

    dCells, dShapeCells, dCellCndPerRow, dCellCndPerCol = createCellZoneFromIntersection(
        lClusterPoly, lZonePoly)
    dTextInCells = {}  #key = cell.wkt
    for i in range(len(lClusters)):
            # NOTE(review): the head of this statement is missing (presumably
            # the call that merges the result into dTextInCells).
            assignTextLinesToCells(dClusterWithTL[i], dCellCndPerRow[i]))
    # NOTE(review): last argument and closing parenthesis missing.
    lClusterPoly = shakeRectangleShapes(pageNd, lClusterPoly, dCellCndPerRow,

    lClusterPoly = ajustRectangleShapes(pageNd, lClusterPoly)

    # again recompute cell zones
    dCells, dShapeCells, dCellCndPerRow, dCellCndPerCol = createCellZoneFromIntersection(
        lClusterPoly, lZonePoly)
    dTextInCells = {}  #key = cell.wkt
    for i in range(len(lClusters)):
            assignTextLinesToCells(dClusterWithTL[i], dCellCndPerRow[i]))

    # do it first ??
    #check if row is a merge
    lRowtobeDel, lNewRowCluster, lNewClusterTL, lNewClusterCC, lTLtoBeAdded = checkClusteringPerRow(
        pageNd, dCellCndPerRow, dTextInCells, dClusterWithTL, dEdges)
    if lNewRowCluster:
        # keep only the clusters whose row is not flagged for deletion
        # NOTE(review): the closing bracket of this list (and whatever
        # followed it) is missing from the extract.
        lClusterPoly = [
            ShapeLoader.node_to_Polygon(nd) for i, nd in enumerate(lClusters)
            if i not in lRowtobeDel
        # NOTE(review): getClusterTL is called with three arguments near the
        # top of this function but only two are visible here -- an argument
        # line may have been lost.
        dClusterWithTL = getClusterTL(
            [c for i, c in enumerate(lClusters) if i not in lRowtobeDel], dIds)
        # text lines of the newly proposed rows are appended after the kept ones
        for i, ltl in enumerate(lNewClusterTL):
            dClusterWithTL[len(lClusterPoly) + i] = ltl
        #         lClusterPoly= ajustRectangleShapes(pageNd,lClusterPoly)

        dCells, dShapeCells, dCellCndPerRow, dCellCndPerCol = createCellZoneFromIntersection(
            lClusterPoly, lZonePoly)
        dTextInCells = {}  #key = cell.wkt
        for i in range(len(lClusterPoly)):
                assignTextLinesToCells(dClusterWithTL[i], dCellCndPerRow[i]))

        ## populate elements which are not ij the grid/cells : at cc level or at tl level ?
        for i in range(len(lClusterPoly)):
                assignTextLinesToCells(lTLtoBeAdded, dCellCndPerRow[i]))
    ## oversegmentation : test links at cell level if compatible!see 0068

    dUpdateCells, dNewCells = computeProfile(dCells, dTextInCells,
                                             dCellCndPerRow, dCellCndPerCol)


    return dUpdateCells, dNewCells, dShapeCells
    def loadPage(self,
                 funIndex=lambda x: x._du_index,
        # NOTE(review): truncated extract. The signature is cut: the body uses
        # `ndPage`, `shaper_fun` and `bIgnoreHeader`, none of which is declared
        # in the visible part, and the closing "):" is missing. The lines below
        # are presumably docstring text whose triple quotes were lost. Also
        # missing: several `try:` / `except` / `else:` lines, the body of the
        # header test, the head of the first traceln(...) call, and every
        # statement filling loBaseline, lshapeCell, lOrphanBaselineShape and
        # dsetTableByRow.
        #
        # Visible behaviour:
        #   - walks the TableCell nodes, then their Baseline nodes, numbering
        #     each Baseline with a running "du_index";
        #   - turns each Baseline into a shape with shaper_fun and tags it
        #     with _du_index, _du_DU_row, _du_nd and _dom_id;
        #   - compares the y of the parent's centroid with the cell's ymin to
        #     spot baselines stored in the wrong cell, then looks for the best
        #     row for each such orphan (same column first, else nearest column);
        #   - walks all Baseline nodes of the page a second time;
        #   - returns (loBaseline, dsetTableByRow).
        # `funIndex` is not used in the visible code.
        load the page, looking for Baseline
        can filter by DU_row
        return a list of shapely objects
             , a dict of sorted list of objects, by row
        GT BUG: some Baseline are assigned to the wrong Cell
        => we also fix this here....
        loBaseline = []  # list of Baseline shapes
        i = 0

        dsetTableByRow = defaultdict(set)  # sets of object ids, by row

        dNodeSeen = {}
        # first associate a unique id to each baseline and list them
        lshapeCell = []
        lOrphanBaselineShape = []
        for ndCell in MultiPageXml.getChildByName(ndPage, "TableCell"):
            row, col = ndCell.get("row"), ndCell.get("col")
            plg = ShapeLoader.node_to_Polygon(ndCell)
            #ymin, ymax of polygon
            ly = [_y for _x, _y in plg.exterior.coords]
            ymin, ymax = min(ly), max(ly)
            plg._row = int(row)
            plg._col = int(col)
            plg._ymin, plg._ymax = ymin, ymax

            # remembered to detect cells that contributed no baseline
            i0 = i
            for nd in MultiPageXml.getChildByName(ndCell, "Baseline"):
                nd.set("du_index", "%d" % i)
                ndParent = nd.getparent()
                dNodeSeen[ndParent.get('id')] = True
                # NOTE(review): the body of this test is missing (presumably
                # a `continue` skipping column-header content).
                if bIgnoreHeader and ndParent.get("DU_header") == "CH":
                row_lbl = ndParent.get("DU_row")

                # Baseline as a shapely object
                    # NOTE(review): the `try:` line is missing above this call.
                    o = shaper_fun(nd)  #make a LineString
                except Exception as e:
                    traceln("ERROR: id=", nd.getparent().get("id"))
                    raise e
                o._du_index = i
                o._du_DU_row = row_lbl  # can be None
                o._du_nd = nd
                o._dom_id = nd.getparent().get("id")

                # is this object in the correct cell???
                # We must use the centroid of the text box, otherwise a baseline
                # may be assigned to the next row
                #y = o.centroid.y # NOO!!
                y = ShapeLoader.node_to_Polygon(ndParent).centroid.y
                # if ymin <= y and y <= ymax:
                # we allow the content of a cell to overlap the cell lower border
                # NOTE(review): the "in the right cell" branch and the `else:`
                # line are missing; as shown, the orphan marking sits directly
                # under the test.
                if ymin <= y:
                    # this is an orphan!
                    o._bad_cell = plg

                i += 1

            if bIgnoreHeader and i0 == i:
                continue  # empty cells, certainly due to headers, ignore it.

            # end for

        if lOrphanBaselineShape:
                # NOTE(review): head of the traceln(...) call missing.
                "    *** error: %d Baseline in incorrect row - fixing this..."
                % len(lOrphanBaselineShape))
            for o in lOrphanBaselineShape:
                bestrow, bestdeltacol = 0, 9999
                    # NOTE(review): try/except skeleton missing: `o.y` for a
                    # Point, `o.centroid.y` otherwise (compare with the
                    # commented-out copy of this loop in loadPageCol).
                    y = o.y
                    y = o.centroid.y
                for plg in lshapeCell:
                    if plg._ymin <= y and y <= plg._ymax:
                        # sounds good
                        deltacol = abs(o._bad_cell._col - plg._col)
                        if deltacol == 0:
                            # same column, ok it is that one
                            bestrow = plg._row
                            # NOTE(review): `break` and `else:` are presumably
                            # missing before the next test.
                            if bestdeltacol > deltacol:
                                bestdeltacol = deltacol
                                bestrow = plg._row
                # the end of this call is missing from the extract
                traceln("\t id=%s misplaced in row=%s instead of row=%s" %
                        (o._du_nd.getparent().get("id"), o._bad_cell._row,
                del o._bad_cell

        # and (UGLY) process all Baseline outside any TableCell...

        # NOTE(review): the filter that skips Baselines already recorded in
        # dNodeSeen is presumably missing just inside this loop (the body is
        # indented one level deeper than the `for`).
        for nd in MultiPageXml.getChildByName(ndPage, "Baseline"):
                #OLD "GOOD" CODE HERE
                nd.set("du_index", "%d" % i)
                # -> TextLine -> TableCell (possibly)
                ndPrnt = nd.getparent()
                row_lbl = ndPrnt.get("DU_row")

                # Baseline as a shapely object
                o = shaper_fun(nd)  #make a LineString
                o._du_index = i
                o._du_row = None  # Must be None
                o._du_DU_row = row_lbl  # can be None
                o._du_nd = nd
                o._dom_id = nd.getparent().get("id")


                i += 1

        return loBaseline, dsetTableByRow
def project_Elt_to_GT(gtdoc, doc
                      , xpElement1, xpElement2
                      , xpArea2
                      , bSep, lsRmId, bEval
                      , fTH=0.5):
    # NOTE(review): truncated extract. The three lines below are presumably
    # docstring text whose triple quotes were lost. The bodies of several
    # loops (the ones removing nodes from the GT, the ones copying nodes into
    # it) and several `else:` lines are missing; restore them from the
    # original file before running this.
    #
    # Parameters, as used in the visible code:
    #   gtdoc, doc   parsed XML documents (getroot() is called on both)
    #   xpElement1, xpElement2, xpArea2
    #                XPath expressions evaluated with the dNS namespace map
    #   bSep         when true, SeparatorRegion nodes are counted/handled
    #   lsRmId       ids of GT elements to look up for removal
    #   bEval        when true, the placement of each element is compared
    #                with the GT and a score is traced
    #   fTH          minimum overlap ratio (intersection area / element area)
    #                for an element to match an area
    # Returns gtdoc. Raises GTProjectionException when the page counts differ.
    Here we take the element out of the production file to put them in the GT
    WE IGNORE xpArea1 (no need for it)
    We return the GT doc
    gtroot = gtdoc.getroot()

    # Evaluation
    # we build a table of list of TextLineId from the GT to check this SW
    # table_id -> row -> col -> list of element id
    dTable = defaultdict(lambda : defaultdict(lambda : defaultdict(list)))
    nOk, nTot = 0, 0
    if lsRmId:
        nbEltRemoved = 0
        for sRmId in lsRmId:
            # for _nd in gtroot.xpath('//pg:*[@id="%s"]'%sRmId, namespaces=dNS):
            # NOTE(review): only the counter is visible; the statement that
            # actually removes _nd is presumably missing.
            for _nd in gtroot.xpath('//*[@id="%s"]'%sRmId):
                nbEltRemoved += 1
        trace(" (Rm by ID: %d elements removed)" % nbEltRemoved)

    # remove all elements of interest from GT
    # inside TableRegion, we have TextLine, outside we have TextRegion
    if xpElement1 != xpArea2:
        for ndElt in gtroot.xpath(xpElement1, namespaces=dNS):
            if bEval:
                # NOTE(review): body of this loop is missing.
                for ndElt2 in ndElt.xpath(xpElement2, namespaces=dNS):
    for ndElt in gtroot.xpath(xpElement2, namespaces=dNS):
        ndCell = ndElt.getparent()
        # record where the GT puts this element: table id -> row -> col
        if bEval: dTable[ndCell.getparent().get("id")][ndCell.get("row")][ndCell.get("col")].append(ndElt.get("id")) 
    if bEval: traceln("\npEvaluation mode")
    if bSep:
        nbSepRemoved, nbSepAdded = 0, 0
        for _nd in gtroot.xpath('//pg:SeparatorRegion', namespaces=dNS):
            nbSepRemoved += 1
        trace(" (Separators: %d removed" % nbSepRemoved)
    # project the GT areas, page by page
    lNdPage   = doc.getroot().xpath("//pg:Page", namespaces=dNS)
    lNdPageGT =        gtroot.xpath("//pg:Page", namespaces=dNS)
    if len(lNdPage) != len(lNdPageGT):
        raise GTProjectionException("GT and input have different numbers of pages")
    assert len(lNdPage) > 0, "No page??"

    uniqID = 1
    nNdArea2 = 0
    for ndPage, ndPageGT in zip(lNdPage, lNdPageGT):
        lNdArea2 = ndPageGT.xpath(xpArea2, namespaces=dNS)
        loArea2 = [ShapeLoader.node_to_Polygon(nd) for nd in lNdArea2]
        nNdArea2 += len(lNdArea2)
        for ndElt in ndPage.xpath(xpElement2, namespaces=dNS):
            oElt = ShapeLoader.node_to_Polygon(ndElt)
            # overlap of the element with every GT area of this page
            lOvrl = [oElt.intersection(o).area for o in loArea2]
            iMax = argmax(lOvrl) if lOvrl else None
            vMax = -1 if iMax is None else lOvrl[iMax]
            # where to add it?
            # the `vMax > 0` test is evaluated first, so the division is only
            # reached when there is some overlap
            if vMax > 0 and vMax / oElt.area > fTH:
                # ok, this is a match
                ndCell = lNdArea2[iMax]
                # add it directly to the area2 (TableCell)
                if bEval:
                    if ndElt.get("id") in dTable[ndCell.getparent().get("id")][ndCell.get("row")][ndCell.get("col")]: 
                        nOk += 1
                        # NOTE(review): an `else:` presumably preceded the
                        # FAILED trace below.
                        try: traceln('FAILED:in table: id="%s" "%s"' % (ndElt.get("id"), ndElt.xpath(".//pg:Unicode", namespaces=dNS)[0].text))
                        except IndexError:traceln('FAILED:in table: id="%s" NOTEXT"' % (ndElt.get("id")))
                # NOTE(review): an `else:` (no matching area) presumably
                # preceded the lines below.
                # add it outside of any area
                bestNd = ndPageGT
                # add it in its own TextRegion
                ndTR = etree.Element("TextRegion")
                ndTR.set("id", "prjct_region_%d" % uniqID)
                uniqID += 1
                ndTR.set("custom", "")
                ndTR.append(deepcopy(ndElt.xpath("./pg:Coords", namespaces=dNS)[0]))
                if bEval:
                    # elements the GT holds outside any table end up under the
                    # (None, None, None) key of dTable
                    if ndElt.get("id") in dTable[None][None][None]: 
                        nOk += 1
                        try: traceln('FAILED:in table: id="%s" "%s"' % (ndElt.get("id"), ndElt.xpath(".//pg:Unicode", namespaces=dNS)[0].text))
                        except IndexError:traceln('FAILED:in table: id="%s" NOTEXT"' % (ndElt.get("id")))                        
            nTot += 1
        if bSep:
            # NOTE(review): an XPath starting with '//' is evaluated from the
            # document root even when called on ndPage, so this visits the
            # separators of every page on each iteration; './/pg:...' would
            # restrict it to the current page -- confirm the intent.
            for _nd in ndPage.xpath('//pg:SeparatorRegion', namespaces=dNS):
                nbSepAdded += 1
    # nbSepAdded only exists when bSep is true, hence the guard
    if bSep: trace(", %d added.)  " % nbSepAdded)
    if bEval:
        trace(" - evaluation: %d ok out of %d = %.2f%%\n" % (nOk, nTot, 100*nOk / (nTot+0.0001)))

    # NOTE(review): the exception raised here is named ProjectException while
    # the one raised above is GTProjectionException; unless both exist, this
    # line fails with NameError -- confirm which name is defined.
    if nNdArea2 == 0: raise ProjectException("Empty GT")
    return gtdoc