def minifyResult(self):
    """Minify the stored result in place and return it.

    When ``self._result`` is falsy (for example ``None`` or an empty string)
    it is returned untouched and ``DomUtils.minifyDom`` is never called.
    Otherwise the output of ``DomUtils.minifyDom`` replaces ``self._result``
    and that new value is returned.
    """
    if self._result:
        self._result = DomUtils.minifyDom(self._result)
    return self._result
def _postAnalyzeImpl(self):
    """Convert the analyzed raw source into the final rendered result.

    Reads ``self._raw`` and ``self._blocks``. Writes the processed text to
    ``self._result`` and stores the created tags in ``self._tags``. Nothing is
    returned.

    The stages run in this order:
        1. Clean whitespace around HTML tags already present in the raw source.
        2. Create the tags and blank out unwanted line breaks between them.
        3. Blank out comment blocks together with their adjacent whitespace.
        4. Run the multi-stage tag actions (redefinition, connectivity,
           render, cleanup).
        5. Clean HTML whitespace again and strip leftover markup.
        6. Replace newlines that are outside preserved blocks and tags with
           ``<br />``.
        7. Remove matches of the div/line-break pattern, unescape ``[*`` and
           ``*]``, apply the final result modifications and minify.
    """
    self.trace('Analysis complete. Beginning processing...')

    #-------------------------------------------------------------------------------------------
    # CLEANUP HTML WHITESPACE
    #       For html already in the page, either explicit or created by the preprocessor remove
    #       the newlines between tags to prevent them from being viewed as line breaks during
    #       Markup rendering.
    self._result = self._removeHTMLTagWhitespace(self._raw.replace(u'\r', u' '))
    self.trace('CLEANED -> preliminary HTML whitespace', showResult=True)

    tags = self._createTags()

    #-------------------------------------------------------------------------------------------
    # REMOVE UNWANTED LINE BREAKS
    #       Newlines that occur between close and open Markup tags should be removed.
    #       Every replacement below has the same length as the text it replaces, so the match
    #       positions produced by finditer (which scans the string as it was when the iterator
    #       was created) remain valid while self._result is being rebuilt.
    blankingPatterns = (
        MarkupProcessor._STRIP_WHITESPACE_PATTERN,
        MarkupProcessor._CONTINUE_LINE_PATTERN,
    )
    for pattern in blankingPatterns:
        # finditer is evaluated when each inner loop starts, so the second pattern scans the
        # output of the first.
        for match in pattern.finditer(self._result):
            start = match.start()
            end = match.end()
            self._result = self._result[:start] + u' ' * (end - start) + self._result[end:]

    for match in MarkupProcessor._UNWANTED_NEWLINE_PATTERN.finditer(self._result):
        start = match.start()
        end = match.end()

        # Skip line breaks inside of leaf tags.
        skip = False
        startTag = None
        endTag = None
        for t in tags:
            if not startTag:
                c = t.findContainingBlock(start)
                if c and c.isLeafTag:
                    skip = True
                    break
                startTag = c
            if not endTag:
                endTag = t.findContainingBlock(end)
            if startTag and endTag:
                break
        if skip:
            continue

        # Skip tags that are not display:block or voids
        skip = (startTag and not (startTag.isBlockDisplay or startTag.isVoidTag)) or \
               (endTag and not (endTag.isBlockDisplay or endTag.isVoidTag))
        if skip:
            continue

        replace = match.group('close') \
            + match.group('space').replace(u'\n', u' ') \
            + match.group('open')
        self._result = self._result[:start] + replace + self._result[end:]

    self.trace('Line breaks cleaned up; Starting Tag rendering...')

    #-------------------------------------------------------------------------------------------
    # BLANK OUT COMMENT BLOCKS
    #       Each comment block, plus any carriage returns, newlines and tabs directly around it,
    #       is overwritten with spaces of equal length so that later offsets are unaffected.
    commentPadding = (u'\r', u'\n', u'\t')
    for b in self._blocks:
        if b.blockType != BlockSyntaxEnum.COMMENT:
            continue

        start = b.start
        # `start > 0` (rather than `start - 1 > 0`) lets a padding character at index 0 be
        # absorbed too, mirroring the forward scan which runs up to the end of the string.
        while start > 0 and self._result[start - 1] in commentPadding:
            start -= 1

        end = b.end
        while end < len(self._result) and self._result[end] in commentPadding:
            end += 1

        length = end - start
        if length > 0:
            self._result = self._result[:start] + u' ' * length + self._result[end:]

    #-------------------------------------------------------------------------------------------
    # EXECUTE THE MULTI-STAGE RENDERING PROCESS
    self._executeTagActions(tags, 'redefinitionTraversal', 'Redefinition Traversal Failure')
    self._executeTagActions(tags, 'redefinitionReversal', 'Redefinition Reversal Failure')
    self._executeTagActions(tags, 'connectivityTraversal', 'Connectivity Traversal Failure')
    self._executeTagActions(tags, 'connectivityReversal', 'Connectivity Reversal Failure')

    self.trace(self.traceHierarchy('Global pre-render complete:', tags))

    self._executeTagActions(tags, 'render', 'Render Failure')
    self._executeTagActions(tags, 'cleanupTraversal', 'Cleanup Traversal Failure')
    # The label names the stage that actually failed (cleanupReversal).
    self._executeTagActions(tags, 'cleanupReversal', 'Cleanup Reversal Failure')

    self.trace('RENDERED RESULT -> Markup converted to HTML', showResult=True)

    #-------------------------------------------------------------------------------------------
    # POST-PROCESSING HTML WHITESPACE CLEANUP
    self._result = self._removeHTMLTagWhitespace(self._result.replace(u'\r', u' '))
    self.trace('CLEANED -> post HTML whitespace', showResult=True)

    #-------------------------------------------------------------------------------------------
    # STRIP MARKUP
    #       The matches were located in the string as it was before any removal, so `offset`
    #       accumulates the value returned by insertCharacters (presumably the net change in
    #       length -- confirm against its definition) to translate them to current positions.
    offset = 0
    for match in MarkupProcessor._STRIP_PATTERN.finditer(self._result):
        s = match.start() + offset
        e = match.end() + offset
        offset += self.insertCharacters(s, e, u'')
    self.trace('STRIPPED -> markup', showResult=True)

    #-------------------------------------------------------------------------------------------
    # HANDLE NEWLINES NOT IN TAGS
    # NOTE(review): preserveBlocks is computed once, before any <br /> is inserted. Whether the
    # start/end values of those blocks stay aligned with self._result after insertions cannot
    # be determined from this method -- confirm.
    offset = 0
    tagIndex = 0
    preserveBlocks = DomUtils.getPreserveBlocks(self._result)
    while True:
        index = self._result.find('\n', offset)
        if index == -1:
            break

        # Ignore line breaks inside preserved tags
        container = None
        for b in preserveBlocks:
            if b.start <= index <= b.end:
                container = b
                break
        if container is not None:
            offset = container.end + 1
            continue

        # Advance past every tag that ends before this newline.
        while tagIndex < len(tags) and index > tags[tagIndex].end():
            tagIndex += 1

        # A newline inside a tag is left alone; resume searching at the end of that tag.
        if tagIndex < len(tags) and tags[tagIndex].contains(index):
            offset = tags[tagIndex].end()
            tagIndex += 1
            continue

        offset = index + self.insertCharacters(index, index + 1, u'<br />')
    self.trace('ADDED breaks to newlines not in tags', showResult=True)

    #-------------------------------------------------------------------------------------------
    # REMOVE BR DIV COMBOS
    offset = 0
    for match in MarkupProcessor._DIV_LINE_BREAK_PATTERN.finditer(self._result):
        s = match.start() + offset
        e = match.end() + offset
        offset += self.insertCharacters(s, e, u'')

    # Unescape the literal bracket sequences.
    self._result = self._result.replace(u'[*', u'[').replace(u'*]', u']')

    self._tags = tags

    #-------------------------------------------------------------------------------------------
    # FINAL MODIFICATIONS TO THE RENDERED DOM
    self._result = self._modifyResult(self._result)

    #-------------------------------------------------------------------------------------------
    # CLEANUP WHITE SPACE
    self._result = DomUtils.minifyDom(self._result)
    self.trace('Post processing complete.')
def _removeHTMLTagWhitespace(self, source):
    """Replace line-breaking whitespace in and around HTML tags with spaces.

    Three passes are made over ``source``:
        1. Newlines inside each ``_HTML_INSIDE_TAG_PATTERN`` match are turned
           into spaces.
        2. Whitespace captured in front of a tag by
           ``_HTML_PRE_TAG_WHITESPACE_PATTERN`` is overwritten with the same
           number of spaces.
        3. Whitespace captured after a tag by
           ``_HTML_POST_TAG_WHITESPACE_PATTERN`` is overwritten the same way.

    In passes 2 and 3 a match is left untouched when it starts or ends inside
    one of the blocks returned by ``DomUtils.getPreserveBlocks``, and
    whitespace next to span/anchor tags is kept when the neighbouring content
    is another span/anchor tag or is not an HTML tag at all.

    @param source: The text to clean.
    @return: The cleaned text.
    """
    preserved = DomUtils.getPreserveBlocks(source)

    def touchesPreserved(first, last):
        # True when either end of the matched span lies within a preserved block.
        return any(
            block.start <= first <= block.end or block.start <= last <= block.end
            for block in preserved
        )

    inlineOpeners = (u'<span', u'<a')
    inlineClosers = (u'</span>', u'</a>')

    # Pass 1: newlines inside of tags
    for match in MarkupProcessor._HTML_INSIDE_TAG_PATTERN.finditer(source):
        flattened = match.group('tag').replace(u'\n', u' ')
        source = source[:match.start()] + flattened + source[match.end():]

    # Pass 2: whitespace in front of tags
    for match in MarkupProcessor._HTML_PRE_TAG_WHITESPACE_PATTERN.finditer(source):
        begin, finish = match.span()
        tag = match.group('tag')
        head = source[:begin]

        if StringUtils.begins(tag, inlineOpeners):
            trimmedHead = head.strip()
            # Preserve lines between span tags, and between span tags and non-html
            # entities like text
            if StringUtils.ends(trimmedHead, inlineClosers) or not trimmedHead.endswith(u'>'):
                continue

        if touchesPreserved(begin, finish):
            continue

        padding = u' ' * len(match.group('whitespace'))
        source = head + padding + tag + source[finish:]

    # Pass 3: whitespace following tags
    for match in MarkupProcessor._HTML_POST_TAG_WHITESPACE_PATTERN.finditer(source):
        begin, finish = match.span()
        tag = match.group('tag')
        tail = source[finish:]

        if tag in inlineClosers:
            trimmedTail = tail.strip()
            # Preserve lines between span tags, and between span tags and non-html
            # entities like text
            if StringUtils.begins(trimmedTail, inlineOpeners) or not trimmedTail.startswith(u'<'):
                continue

        if touchesPreserved(begin, finish):
            continue

        padding = u' ' * len(match.group('whitespace'))
        source = source[:begin] + tag + padding + tail

    return source