def loadPaperParagraph(self, p, newDocument, parent_id):
    """
    Creates a paragraph in newDocument from the markup node `p`, splits its
    rendered text into sentences and loads each one via loadPaperSentence.

    Args:
        p: node exposing `.parent` and `.renderContents(encoding=None)`
            (presumably a BeautifulSoup tag -- confirm against the caller).
        newDocument: document object; must provide `addParagraph(parent_id)`.
        parent_id: passed through unchanged to `newDocument.addParagraph`.

    Returns:
        Whatever `newDocument.addParagraph` returned for the new paragraph,
        or None when `p` is skipped (table-cell content or a
        "Proceedings of the ..." header line).
    """
    # A node with no parent cannot be a table cell; read the name defensively
    # so a missing parent does not raise AttributeError.
    if getattr(p.parent, "name", None) == "td":
        # This is not a content paragraph, but the content of a table cell
        return None

    par_text = p.renderContents(encoding=None)
    if re.match(r"(<i>)?proceedings\s+of\s+the\s+.*", par_text, flags=re.IGNORECASE):
        # This is not a content paragraph, we throw it away
        return None

    newPar = newDocument.addParagraph(parent_id)

    try:
        sentences = sentenceSplit(par_text)
    except Exception:
        # Best-effort fallback: keep the whole paragraph as a single sentence.
        # `except Exception` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        print("UNICODE ERROR!", par_text)
        sentences = [par_text]

    for s in sentences:
        self.loadPaperSentence(s, newDocument, newPar["id"])

    return newPar
def loadJATSParagraph(self, p, newDocument, parent):
    """
    Adds a new paragraph under `parent` in newDocument, then loads every
    sentence found in the rendered contents of `p` through loadJATSSentence.

    Returns the paragraph object produced by `newDocument.addParagraph`.
    """
    paragraph = newDocument.addParagraph(parent)
    rendered = p.renderContents(encoding=None)

    # Each sentence is attached to the new paragraph; `parent` is forwarded
    # to loadJATSSentence as well.
    for sentence in sentenceSplit(rendered):
        self.loadJATSSentence(sentence, newDocument, paragraph["id"], parent)

    return paragraph