Example #1
0
    def _get_GraphNodeText(self, doc, domNdPage, ndBlock, ctxt=None):
        """
        Extract the text of a DOM node
        
        Get the DOM, the DOM page node, the page object DOM node, and optionally an xpath context

        return a unicode string
        """
        lNdText = ndBlock.xpath(self.sxpTextual, namespaces=self.dNS)
        if len(lNdText) != 1:
            if len(lNdText) > 1:
                raise ValueError(
                    "More than 1 textual content for this node: %s" %
                    etree.tostring(ndBlock))

            #let's try to get th etext of the words, and concatenate...
            # traceln("Warning: no text in node %s => looking at words!"%ndBlock.prop("id"))
            # lsText = [ntext.content.decode('utf-8').strip() for ntext in ctxt.xpathEval('.//pc:Word/pc:TextEquiv//text()')] #if we have both PlainText and UnicodeText in XML, :-/
            lsText = [
                _nd.text.strip()
                for _nd in ctxt.xpathEval('.//pc:Word/pc:TextEquiv')
            ]  #if we have both PlainText and UnicodeText in XML, :-/
            return " ".join(lsText)

        return PageXml.makeText(lNdText[0])
Example #2
0
    def _get_GraphNodeText(self, doc, domNdPage, ndBlock):
        """
        Extract the text of a DOM node, requiring exactly one textual child.

        ndBlock is queried with the XPath expression self.sxpTextual and the
        namespace map self.dNS. The single matching node is handed to
        PageXml.makeText() and its result is returned.

        doc and domNdPage are accepted but not used by this method.

        Raise ValueError when the query matches no node, or more than one;
        the message embeds the serialized ndBlock.
        """
        lNdMatch = ndBlock.xpath(self.sxpTextual, namespaces=self.dNS)
        nMatch = len(lNdMatch)

        if nMatch == 0:
            raise ValueError("I found no useful TextEquiv below this node... \n%s"%etree.tostring(ndBlock))
        if nMatch > 1:
            raise ValueError("I expected exactly one useful TextEquiv below this node. Got many... \n%s"%etree.tostring(ndBlock))

        ndText = lNdMatch[0]
        return PageXml.makeText(ndText)