Example #1
0
    def loadPaperParagraph(self, p, newDocument, parent_id):
        """
            Creates a paragraph in newDocument, splits the text into sentences,
            and hands each sentence to self.loadPaperSentence.

            Args:
                p: paragraph element; must expose `parent.name` and
                    `renderContents(encoding=None)` (presumably a
                    BeautifulSoup tag -- confirm against the caller).
                newDocument: document being built; its `addParagraph` is
                    called with parent_id.
                parent_id: passed through to `newDocument.addParagraph`.

            Returns:
                The paragraph returned by `newDocument.addParagraph`, or None
                when the element is skipped (content of a table cell, or text
                starting with "Proceedings of the ...").
        """
        if p.parent.name == "td":
            # This is not a content paragraph, but the content of a table cell
            return None

        par_text=p.renderContents(encoding=None)
        if re.match(r"(<i>)?proceedings\s+of\s+the\s+.*",par_text,flags=re.IGNORECASE):
            # This is not a content paragraph, we throw it away
            return None

        # The paragraph is only created once both skip checks have passed, so
        # skipped elements leave newDocument untouched.
        newPar=newDocument.addParagraph(parent_id)

        try:
            sentences=sentenceSplit(par_text)
        except Exception:
            # Best-effort fallback: if splitting fails, keep the whole
            # paragraph text as a single sentence. Catching Exception rather
            # than using a bare except lets KeyboardInterrupt and SystemExit
            # propagate instead of being silently swallowed here.
            print("UNICODE ERROR!",par_text)
            sentences=[par_text]

        for s in sentences:
            self.loadPaperSentence(s,newDocument,newPar["id"])
        return newPar
Example #2
0
    def loadJATSParagraph(self, p,newDocument, parent):
        """
            Adds a new paragraph to newDocument under `parent`, then passes
            every sentence that sentenceSplit yields for the rendered
            contents of `p` to self.loadJATSSentence.

            Returns the paragraph obtained from newDocument.addParagraph.
        """
        # Create the paragraph first; its "id" is what each sentence is
        # registered against.
        paragraph = newDocument.addParagraph(parent)

        for sentence in sentenceSplit(p.renderContents(encoding=None)):
            self.loadJATSSentence(sentence, newDocument, paragraph["id"], parent)

        return paragraph