def __handleParagraph__(self, dstree, dsnode, paragraph):
    """Split ``paragraph`` into sentences and hand each one to
    ``__handleTextSentence__``.

    The result of ``self.sentence.split`` is consumed as alternating items:
    even indices are sentence fragments, odd indices are remembered so that
    their last character can be put back in front of the following fragment.
    NOTE(review): this presumes ``self.sentence`` is a compiled pattern with
    one capturing group; the pattern itself is defined elsewhere - confirm.

    Each non-empty fragment is terminated with a period when it does not
    already end in one, and a quote that opens the fragment is closed again
    when the fragment holds an odd number of that quote character.

    After all sentences are processed, ``DocumentStructure.deleteValue`` is
    called on ``dsnode``.
    """
    quoteSymbols = (DocumentStructure.SYMBOL_DBLQUOTE,
                    DocumentStructure.SYMBOL_SINGLEQUOTE)
    prevItem = ''
    for i, sentence in enumerate(self.sentence.split(paragraph)):
        if i % 2 != 0:
            # Odd item: keep it until the next fragment arrives.
            prevItem = sentence
            continue
        if not sentence:
            # Empty fragment: nothing to emit, prevItem is kept as it was.
            continue

        if prevItem:
            # Restore the character carried over from the preceding odd item.
            firstLetter = prevItem[-1]
            sentence = firstLetter + sentence
            prevItem = ''
        else:
            firstLetter = sentence[0]

        # The original indexed ``sentence`` with ``len(prevItem) - 1``; since
        # prevItem is always empty at this point that meant ``sentence[-1]``.
        if not sentence.endswith('.'):
            sentence = sentence + '.'

        # Balance a quote that opens the sentence but is never closed.
        for quote in quoteSymbols:
            if firstLetter == quote:
                if sentence.count(quote) % 2 != 0:
                    sentence += firstLetter
                break

        self.__handleTextSentence__(dstree, dsnode, sentence.strip())
    DocumentStructure.deleteValue(dsnode)
def __handle_title__(self, xmlnode, dstree, dsnode):
    """Store the value parsed from ``xmlnode`` on the document node.

    ``dsnode`` must be at ``DocumentStructure.DOCUMENT`` level; otherwise an
    ``Exception`` with the message "Unexpected level" is raised.

    Returns ``dsnode`` unchanged (``dstree`` is not used here).
    """
    # Assumes document node
    if DocumentStructure.getLevel(dsnode) != DocumentStructure.DOCUMENT:
        # Call form of raise: same behaviour on Python 2, valid on Python 3.
        raise Exception("Unexpected level")
    DocumentStructure.setNode(dsnode, value=util.parseValueFrom(xmlnode))
    return dsnode
def appendToValue(dsnode, value):
    """Append ``value`` to the value currently stored on ``dsnode``.

    When both the stored value and ``value`` are present they are joined with
    a single space. When the node has no value yet, ``value`` becomes the
    node's value. When ``value`` is ``None`` the stored value is written back
    unchanged (``DocumentStructure.setNode`` is still called).
    """
    newValue = DocumentStructure.getValue(dsnode)
    if value is not None:
        if newValue is not None:
            # Ensure a space is added between tags
            newValue = newValue + ' ' + value
        else:
            newValue = value
    DocumentStructure.setNode(dsnode, value=newValue)
def handle(self, dstree, dsnode):
    """Normalise the text stored on ``dsnode`` and, for paragraphs, split it.

    The node's value is run through ``self.multispaces``, ``self.spacepunct1``
    and ``self.spacepunct2`` (replaced by ``' '``, ``','`` and ``'.'``
    respectively), stripped, and written back with
    ``DocumentStructure.setValue``. If the node is at
    ``DocumentStructure.PARAGRAPH`` level the cleaned text is then passed to
    ``__handleParagraph__``.

    Nodes with no value (``None``) or an empty value are left untouched.

    NOTE(review): the source formatting was lost; the paragraph check is
    assumed to sit inside the non-empty branch - confirm against history.
    """
    value = DocumentStructure.getValue(dsnode)
    if not value:
        # getValue may return None; len(None) used to raise TypeError here.
        return

    # Disabled in the original: value = self.dot.sub('. ', value)
    value = self.multispaces.sub(' ', value)
    value = self.spacepunct1.sub(',', value)
    value = self.spacepunct2.sub('.', value)
    value = value.strip()
    DocumentStructure.setValue(dsnode, value)

    if util.getLevel(dsnode) == DocumentStructure.PARAGRAPH:
        # value is already stripped, so it is passed on as is.
        self.__handleParagraph__(dstree, dsnode, value)
def __handleTextClause__(self, dstree, dsnode, clause):
    """Add a TEXT_CLAUSE node for ``clause`` below ``dsnode``.

    The stripped clause is split with ``self.phrase``. A clause that does not
    split is stored directly as the value of the new node. Otherwise an empty
    TEXT_CLAUSE node is created and every non-blank piece is passed to
    ``__handleTextPhrase__`` with that node as parent.

    ``DocumentStructure.deleteValue`` is called on ``dsnode`` at the end.
    """
    pieces = re.split(self.phrase, clause.strip())
    if len(pieces) != 1:
        clauseNode = dstree.addNode(dsnode, DocumentStructure.TEXT_CLAUSE)
        for piece in pieces:
            # re.split yields None for capture groups that did not take part.
            if piece is None:
                continue
            trimmed = piece.strip()
            if trimmed:
                self.__handleTextPhrase__(dstree, clauseNode, trimmed)
    else:
        dstree.addNode(dsnode, DocumentStructure.TEXT_CLAUSE, value = clause)
    DocumentStructure.deleteValue(dsnode)
def __handle_caption__(self, xmlnode, dstree, dsnode):
    """Handle a caption element.

    Obtains a node from ``prepareNode`` for PARAGRAPH level, parses the
    caption text from ``xmlnode`` and, when the obtained node really is at
    PARAGRAPH level, passes the text through ``util.ensureEndsInPeriod``.
    The text is then handed to ``self.abstractHandler.handleReplace``.

    Returns the result of ``handleIndent`` for the obtained node.
    """
    targetNode = self.prepareNode(dstree, dsnode, DocumentStructure.PARAGRAPH)
    captionText = util.parseValueFrom(xmlnode)

    isParagraph = DocumentStructure.getLevel(targetNode) == DocumentStructure.PARAGRAPH
    if isParagraph:
        captionText = util.ensureEndsInPeriod(captionText)

    self.abstractHandler.handleReplace(dstree, targetNode, captionText)
    return self.handleIndent(targetNode, xmlnode)
def __handle_table__(self, xmlnode, dstree, dsnode):
    """Handle a table element by appending its ``summary`` attribute.

    Without a ``summary`` attribute only ``handleIndent`` is applied to the
    incoming node. With one, a node is obtained from ``prepareNode`` for
    PARAGRAPH level, the summary is passed through
    ``util.ensureEndsInPeriod`` when that node is at PARAGRAPH level, and the
    text (padded with one space on each side) is appended to the node's value.

    Returns the result of ``handleIndent``.
    """
    if 'summary' not in xmlnode.attrib:
        return self.handleIndent(dsnode, xmlnode)

    summary = xmlnode.attrib['summary']
    dsnode = self.prepareNode(dstree, dsnode, DocumentStructure.PARAGRAPH)
    atParagraph = DocumentStructure.getLevel(dsnode) == DocumentStructure.PARAGRAPH
    if atParagraph:
        summary = util.ensureEndsInPeriod(summary)
    util.appendToValue(dsnode, ' ' + summary + ' ')
    return self.handleIndent(dsnode, xmlnode)
def __handleTextSentence__(self, dstree, dsnode, sentence):
    """Add a TEXT_SENTENCE node for ``sentence`` below ``dsnode``.

    The stripped sentence is split with ``self.clause``. A sentence that does
    not split is stored directly as the value of the new node. Otherwise an
    empty TEXT_SENTENCE node is created; the split result is read as
    alternating items, where each odd item is prepended to the following even
    item, and every resulting clause is passed to ``__handleTextClause__``.
    NOTE(review): this presumes ``self.clause`` is a compiled pattern with one
    capturing group; the pattern is defined elsewhere - confirm.

    ``DocumentStructure.deleteValue`` is called on ``dsnode`` at the end.
    """
    clauses = self.clause.split(sentence.strip())
    if len(clauses) == 1:
        dstree.addNode(dsnode, DocumentStructure.TEXT_SENTENCE, value=sentence)
    else:
        dschildnode = dstree.addNode(dsnode, DocumentStructure.TEXT_SENTENCE)
        prevItem = ''
        for i, clause in enumerate(clauses):
            if i % 2 != 0:
                # Odd item: hold it back for the next clause.
                prevItem = clause
                continue
            if prevItem:
                clause = prevItem + clause
                prevItem = ''
            # The original had a no-op here
            # (``if clause.endswith('.') == False: clause = clause``);
            # it changed nothing and has been removed.
            self.__handleTextClause__(dstree, dschildnode, clause)
    DocumentStructure.deleteValue(dsnode)
def prepareNode(self, dstree, dsnode, level, indent = None):
    """Return a node positioned so that content of ``level`` can be added.

    The current node's level, the level above ``level`` and ``level`` itself
    are converted with ``util.levelToInt`` and compared numerically. Depending
    on the comparison, nodes are added through ``dstree.addNode`` or the tree
    is walked towards the root with ``getparent()``.

    Side effect: ``self.preInsertedParagraph`` is set in the first branch
    (True when a PARAGRAPH node was added, False otherwise).

    NOTE(review): the source formatting was lost; the nesting below was
    reconstructed from the statement order - confirm against history.
    """
    curLevel = util.levelToInt(DocumentStructure.getLevel(dsnode))
    levelAbove = util.levelToInt(util.levelAbove(level))
    levelDesired = util.levelToInt(level)
    if curLevel >= levelAbove:
        # Add intermediate nodes, one level at a time, continuing from each
        # returned node until its numeric level equals levelAbove.
        while curLevel > levelAbove:
            newLevel = util.levelBelow(DocumentStructure.getLevel(dsnode))
            dsnode = dstree.addNode(dsnode, level = newLevel, indent = indent)
            curLevel = util.levelToInt(DocumentStructure.getLevel(dsnode))
        if level == DocumentStructure.PARAGRAPH:
            # For paragraphs the paragraph node itself is added here and
            # returned; the flag records that this happened.
            self.preInsertedParagraph = True
            dsnode = dstree.addNode(dsnode, level = DocumentStructure.PARAGRAPH, indent = indent)
        else:
            self.preInsertedParagraph = False
    # Always true when reached (the branch above covers >=); acts as an else.
    elif curLevel < levelAbove:
        # Walk up through the parents until the numeric level is no longer
        # below levelDesired.
        while curLevel < levelDesired:
            dsnode = dsnode.getparent()
            curLevel = util.levelToInt(DocumentStructure.getLevel(dsnode))
        # For non-paragraph levels go one parent further up.
        if level != DocumentStructure.PARAGRAPH:
            dsnode = dsnode.getparent()
        DocumentStructure.setIndent(dsnode, indent)
    return dsnode
def __handle_img__(self, xmlnode, dstree, dsnode):
    """Handle an image element by appending its textual description.

    The description is the ``alt`` attribute when present, otherwise the
    ``title`` attribute, otherwise an empty string. A node is obtained from
    ``prepareNode`` for PARAGRAPH level; when that node is at PARAGRAPH level
    the description is passed through ``util.ensureEndsInPeriod``. The text
    (padded with one space on each side) is appended to the node's value.

    Returns the result of ``handleIndent``.
    """
    description = ''
    # First attribute found wins; 'alt' takes precedence over 'title'.
    for attrName in ('alt', 'title'):
        if attrName in xmlnode.attrib:
            description = xmlnode.attrib[attrName]
            break

    dsnode = self.prepareNode(dstree, dsnode, DocumentStructure.PARAGRAPH)
    atParagraph = DocumentStructure.getLevel(dsnode) == DocumentStructure.PARAGRAPH
    if atParagraph:
        description = util.ensureEndsInPeriod(description)
    util.appendToValue(dsnode, ' ' + description + ' ')
    return self.handleIndent(dsnode, xmlnode)
def handleIndent(self, dsnode, xmlnode):
    """Copy the indent read from ``xmlnode`` onto ``dsnode``.

    The indent is obtained with ``DocumentStructure.getIndent`` and applied
    with ``DocumentStructure.setIndent``. Returns ``dsnode``.
    """
    DocumentStructure.setIndent(dsnode, DocumentStructure.getIndent(xmlnode))
    return dsnode
def getLevel(dsnode):
    """Return the level of ``dsnode`` as given by ``DocumentStructure.getLevel``."""
    nodeLevel = DocumentStructure.getLevel(dsnode)
    return nodeLevel