def parse_curlies(self):
    """ Turn a curly-brace group into an ITextNode.

    Unlike parse_word, whitespace does not end the node here: parsing stops
    only at the closing curly brace of this group or when the feed runs out.
    Nested groups, commands and one-character commands are parsed by their
    own methods and appended to the node as children; any other character is
    appended as is.

    Reads and consumes ``self.feed``, a list of chars (strings of length 1)
    where the next character to parse is the last item. The last item is
    expected to be the opening "{" of the group.

    :return: ITextNode with the contents of the group
    """
    node = ITextNode()
    self.feed.pop()  # eat first "{"
    while self.feed:
        c = self.feed[-1]
        if c == '{':
            # Nested group: the recursive call consumes it and returns the
            # node that has to be stored as a child of this group.
            new_node = self.parse_curlies()
            node.append(new_node)
        elif c == '}':
            # End of this group: consume the brace and stop.
            self.feed.pop()
            break
        elif c == '\\':
            new_node = self.parse_command()
            node.append(new_node)
        elif c == '$':
            # NOTE(review): nothing is popped here, so this relies on
            # toggle_math_mode consuming the "$" itself -- confirm.
            self.toggle_math_mode()
        elif c in one_character_commands:
            new_node = self.parse_one_character_command()
            node.append(new_node)
        else:
            self.feed.pop()
            node.append(c)
    return node
def process(self, text):
    """ Parse a LaTeX-flavoured field value into INodes (intermediary nodes).

    Simpler version of parse. The text is stripped, turned into a feed of
    single characters and consumed with repeated calls to parse_word.
    Results are ITextNodes that may contain more ITextNodes and
    ICommandNodes.

    :param text: string to parse.
    :return: "" when there is nothing to parse or nothing was parsed, the
        single parse_word result when there is exactly one, otherwise a
        tidied ITextNode that has all the results as its parts.
    """
    self.math_mode = False
    self.node = None
    stripped = text.strip() if text else ""
    if not stripped:
        return ""
    # The feed is consumed from its end, so store the characters reversed.
    self.feed = list(reversed(stripped))
    parsed = []
    while self.feed:
        remaining = len(self.feed)
        word = self.parse_word()
        if len(self.feed) >= remaining:
            # parse_word consumed nothing: drop one character so that we
            # are not stuck in an endless loop, and discard its result.
            self.feed.pop()
            continue
        parsed.append(word)
    if not parsed:
        return ""
    if len(parsed) == 1:
        return parsed[0]
    return ITextNode(parts=parsed).tidy()
def handle_startendtag(self, tag, attrs):
    """ Handle a self-closing tag such as ``<br/>``.

    In rows mode a 'br' ends the current row: the node built so far is
    stored into ``self.rows`` and parsing continues into a fresh ITextNode
    with an empty stack of open tags. After that the command matching the
    tag is appended to the current node as an ICommandNode without parts.

    Tags that have no entry in ``html_to_command`` are skipped, the same way
    handle_starttag skips them, instead of raising KeyError.

    :param tag: name of the tag
    :param attrs: attributes of the tag, not used
    """
    if tag == 'br' and self.rows_mode:
        self.rows.append(self.current)
        # you may want to start again all currently open tags for the next line
        self.stack = []
        self.current = ITextNode()
    if tag in html_to_command:
        command = html_to_command[tag]
        self.current.append(ICommandNode(command))
def continue_on_new_line(stack):
    """ Build the stack for a new line so that open commands continue on it.

    Starts from a fresh ITextNode and, for every ICommandNode in the given
    stack, creates a new empty ICommandNode with the same command, nests it
    inside the previously created node and pushes it to the new stack.
    Items that are not ICommandNodes are skipped.

    :param stack: list of currently open nodes, outermost first
    :return: new list: the fresh ITextNode followed by the re-created
        command nodes, outermost first
    """
    reopened = [ITextNode()]
    for open_node in stack:
        if not isinstance(open_node, ICommandNode):
            continue
        clone = ICommandNode(open_node.command)
        # reopened always holds at least the root node, so the innermost
        # node is always available to receive the clone.
        reopened[-1].append(clone)
        reopened.append(clone)
    return reopened
def parse_command(self):
    """ Turn text into ICommandNodes. These are best understood as tags, where
    the tag is the command, and parts of the node are the scope of the tag.

    Eats the leading backslash, then reads characters into the command word
    until something ends it:

    * an opening curly brace: the group is parsed with parse_curlies and
      becomes the only part (scope) of the command,
    * a closing curly brace, a space or one of "<", ">", "&": the word ends
      and the character is left in the feed,
    * a second backslash: if no word has been read yet, the two backslashes
      together form the command (a LaTeX line break); otherwise the
      backslash is left in the feed to start the next command.

    If the word is a key of ``latex_to_command``, it is replaced with the
    mapped command.

    Reads and consumes ``self.feed``, a list of chars (strings of length 1)
    where the next character to parse is the last item.

    :return: ICommandNode when a command word was read, ITextNode holding the
        parsed group when there was a group but no command word, otherwise ''
    """
    parts = []
    command = ''
    self.feed.pop()  # this is the beginning "\"
    while self.feed:
        c = self.feed[-1]
        if c == '{':
            new_node = self.parse_curlies()
            parts.append(new_node)
            break
        elif c == '}':
            print('odd ending curly')
            break
        elif c == '\\':
            if not command:
                # this is a line break in latex, two backslashes in row,
                # not two command words
                self.feed.pop()
                command += c
            break
        elif c in (' ', '<', '>', '&'):
            break
        elif c == '$':
            # NOTE(review): nothing is popped here, so this relies on
            # toggle_math_mode consuming the "$" itself -- confirm.
            self.toggle_math_mode()
        else:
            self.feed.pop()
            command += c
    if command and command in latex_to_command:
        command = latex_to_command[command]
    if command:
        return ICommandNode(command=command, parts=parts)
    if parts:
        return ITextNode(parts=parts)
    return ''
class HTMLToINode(HTMLParser):
    """ Convert HTML to ICommandNodes and ITextNodes to use as field values.
    Doesn't handle brackets or tree parsing, only fields.

    Tags that are keys of ``html_to_command`` become ICommandNodes and the
    text inside them becomes their content; other tags are ignored and their
    text goes to the enclosing node. Use ``process`` to convert a string.
    """

    def error(self, message):
        """ Report a parse error by printing it. """
        print('HTML parse error: ', message)

    def __init__(self, rows_mode=False):
        """
        :param rows_mode: if True, 'br' tags split the result into rows and
            process returns a list of nodes instead of one node
        """
        super().__init__(convert_charrefs=True)
        self.current = ITextNode()  # node that receives text and commands
        self.stack = []  # nodes to return to when open tags are closed
        self.rows_mode = rows_mode
        self.rows = []  # rows finished so far, only used in rows mode

    def handle_starttag(self, tag, attrs):
        """ Open a command node for a known tag and make it the current node.

        :param tag: name of the tag
        :param attrs: attributes of the tag, not used
        """
        if tag == 'br':
            # 'br' never has content or an end tag, so "<br>" has to behave
            # like "<br/>" instead of opening a scope that is never closed.
            self.handle_startendtag(tag, attrs)
            return
        if tag in html_to_command:
            command = html_to_command[tag]
            self.stack.append(self.current)
            new = ICommandNode(command)
            self.current.append(new)
            self.current = new

    def handle_endtag(self, tag):
        """ Close the current command node and return to its parent.

        Only tags for which handle_starttag pushed to the stack may pop it;
        otherwise the end tag of an ignored tag would close an enclosing
        command too early.

        :param tag: name of the tag
        """
        if tag != 'br' and tag in html_to_command and self.stack:
            self.current = self.stack.pop()

    def handle_startendtag(self, tag, attrs):
        """ Handle a self-closing tag such as ``<br/>``.

        In rows mode a 'br' ends the current row. The command matching the
        tag is then appended to the current node as an ICommandNode without
        parts. Tags without an entry in ``html_to_command`` are skipped.

        :param tag: name of the tag
        :param attrs: attributes of the tag, not used
        """
        if tag == 'br' and self.rows_mode:
            self.rows.append(self.current)
            # you may want to start again all currently open tags for the next line
            self.stack = []
            self.current = ITextNode()
        if tag in html_to_command:
            command = html_to_command[tag]
            self.current.append(ICommandNode(command))

    def handle_data(self, data):
        """ Brackets and other tree defining structures are only found within data objects.
        If we are entering data to field, just take whatever it is here.
        :param data:
        :return:
        """
        self.current.append(data)

    def process(self, string):
        """ Parse a string of HTML and return the resulting node(s).

        The parser is reset afterwards, so the same instance can be used for
        the next string and results of separate calls don't mix.

        :param string: HTML to parse
        :return: in rows mode a list of nodes, one for each row, otherwise
            the node holding the whole result
        """
        self.feed(string)
        # With convert_charrefs the parser may hold back trailing text (e.g.
        # when it has "&" that could start a character reference). close()
        # flushes it; reset() below would throw it away.
        self.close()
        result = self.current
        rows = self.rows
        self.reset()
        # Start the next call with its own list of rows.
        self.rows = []
        if self.rows_mode:
            rows.append(result)
            return rows
        else:
            return result

    def reset(self):
        """ Forget the current node and open tags and reset the base parser.

        ``self.rows`` is deliberately left alone: process still needs the
        rows after it has called this.
        """
        self.current = ITextNode()
        self.stack = []
        super().reset()
def process(self, doc):
    """ Turn the blocks of a document into INodes according to their
    character formats.

    Walks the blocks from ``doc.firstBlock()`` until ``doc.end()``. For each
    block the format ranges from ``textFormats()`` are compared with the
    previously seen format: when a style is switched on, an ICommandNode for
    it is opened and the following text goes inside it; when a style is
    switched off, the command is removed from the stack of open nodes.
    Commands used: 'smallcaps', 'italic', 'strikeout', 'bold', 'underline',
    'sup' and 'sub'. Font family and overline changes are noticed but create
    no commands.

    :param doc: document to read; presumably a QTextDocument -- confirm
        against the caller
    :return: an ITextNode where each row is followed by a '\\n' part if any
        of the rows is an ITextNode, otherwise the rows joined with '\\n'
    """

    def removed(stack, command):
        """ Remove the last node with the given command from the stack, if
        there is one, and return the node that is then on top of the stack. """
        for item in reversed(stack):
            if isinstance(item, ICommandNode) and item.command == command:
                stack.remove(item)
                break
        return stack[-1]

    def continue_on_new_line(stack):
        """ Return a new stack that starts from a fresh ITextNode and has a
        new, empty copy of each ICommandNode of the given stack, each nested
        inside the previous one. """
        new_stack = [ITextNode()]
        for item in stack:
            if isinstance(item, ICommandNode):
                new_command = ICommandNode(item.command)
                if new_stack:
                    new_stack[-1].append(new_command)
                new_stack.append(new_command)
        return new_stack

    b = doc.firstBlock()
    end = doc.end()
    count = 0
    rows = []
    while b:
        # result is the node that receives text, stack[0] is the root of the row
        result = ITextNode()
        stack = [result]
        # The block's own format is the baseline that ranges are compared to
        cf = b.charFormat()
        #b.blockFormat().alignment(),
        caps = cf.fontCapitalization()
        family = cf.fontFamily()
        italic = cf.fontItalic()
        overline = cf.fontOverline()
        strikeout = cf.fontStrikeOut()
        weight = cf.fontWeight()
        underline = cf.fontUnderline()
        vertalign = cf.verticalAlignment()
        for frange in b.textFormats():
            #print('---- block %s ----' % count)
            #print(b.text())
            #print(frange.start, frange.length)
            cf = frange.format
            if caps != cf.fontCapitalization():
                caps = cf.fontCapitalization()
                # 3 is treated as small caps (presumably QFont.SmallCaps -- confirm)
                if caps == 3:
                    command = ICommandNode('smallcaps')
                    stack.append(command)
                    result.append(command)
                    result = command
                    #print('cap: ', caps)
                else:
                    result = removed(stack, 'smallcaps')
            if family != cf.fontFamily():
                # Family changes are only tracked, no command is created
                family = cf.fontFamily()
                #command = ICommandNode('paa')
                #stack.append(command)
                #result.append(command)
                #result = command
                #print('family: ', family)
            if italic != cf.fontItalic():
                italic = cf.fontItalic()
                if italic:
                    command = ICommandNode('italic')
                    stack.append(command)
                    result.append(command)
                    result = command
                    #print('italic: ', cf.fontItalic())
                else:
                    # Italic off closes both 'emph' and 'italic' commands
                    result = removed(stack, 'emph')
                    result = removed(stack, 'italic')
            if overline != cf.fontOverline():
                # Overline is not supported: nothing is done and the stored
                # value is not updated
                pass
                #command = ICommandNode('overline')
                #stack.append(command)
                #result.append(command)
                #result = command
                #print('overline: ', overline)
            if strikeout != cf.fontStrikeOut():
                strikeout = cf.fontStrikeOut()
                if strikeout:
                    command = ICommandNode('strikeout')
                    stack.append(command)
                    result.append(command)
                    result = command
                    #print('strikeout: ', strikeout)
                else:
                    result = removed(stack, 'strikeout')
            if weight != cf.fontWeight():
                weight = cf.fontWeight()
                # Weights over 50 are treated as bold
                if weight > 50:
                    command = ICommandNode('bold')
                    stack.append(command)
                    result.append(command)
                    result = command
                    #print('weight: ', weight)
                else:
                    result = removed(stack, 'bold')
            if underline != cf.fontUnderline():
                underline = cf.fontUnderline()
                if underline:
                    command = ICommandNode('underline')
                    stack.append(command)
                    result.append(command)
                    result = command
                    #print('underline: ', underline)
                else:
                    result = removed(stack, 'underline')
            if vertalign != cf.verticalAlignment():
                # First close the previous alignment (1 is 'sup', 2 is 'sub'),
                # then open the new one
                if vertalign == 1:
                    result = removed(stack, 'sup')
                elif vertalign == 2:
                    result = removed(stack, 'sub')
                vertalign = cf.verticalAlignment()
                if vertalign == 1:
                    command = ICommandNode('sup')
                    stack.append(command)
                    result.append(command)
                    result = command
                    #print('vertical align: ', vertalign)
                elif vertalign == 2:
                    command = ICommandNode('sub')
                    stack.append(command)
                    result.append(command)
                    result = command
                    #print('vertical align: ', vertalign)
            # Text of this range, cut from the text of the block
            text_piece = b.text()[frange.start:frange.start + frange.length]
            lines = text_piece.splitlines()
            if len(lines) > 1:
                # Line breaks inside the range: each line ends a row and the
                # open commands are started again for the next row
                for text_piece in lines:
                    result.append(text_piece)
                    base = stack[0].tidy(keep_node=False)
                    stack = continue_on_new_line(stack)
                    result = stack[-1]
                    rows.append(base)
            else:
                result.append(text_piece)
        base = stack[0].tidy(keep_node=False)
        if base or b != end:  # omit last empty line
            rows.append(base)
        if b == end:
            b = None
        else:
            b = b.next()
        count += 1
    # Return one node or one string instead of rows
    has_nodes = False
    for row in rows:
        if isinstance(row, ITextNode):
            has_nodes = True
            break
    if has_nodes:
        parts = []
        for row in rows:
            parts.append(row)
            parts.append('\n')
        return ITextNode(parts=parts)
    else:
        return '\n'.join(rows)
def parse_word(self, end_on_space=False, return_rows=False):
    """ Turn text into ITextNodes. If something special (commands,
    curly braces) is found, deal with it by creating new nodes of specific
    types with the parse method of that type.

    Parsing stops when the feed runs out, at a closing curly brace (which is
    left in the feed) or, if end_on_space is set, at whitespace (which is
    consumed).

    Reads and consumes ``self.feed``, a list of chars (strings of length 1)
    where the next character to parse is the last item.

    :param end_on_space: stop parsing at the first whitespace character
    :param return_rows: split the result into rows at 'br' commands and
        return a list of rows
    :return: the tidied node, or a list of rows if return_rows is set
    """
    current = ITextNode()
    finished_rows = []
    while self.feed:
        ch = self.feed[-1]
        if ch == '}':
            # Closing brace belongs to the caller, leave it in the feed.
            break
        if ch == '{':
            current.append(self.parse_curlies())
        elif ch == '\\':
            parsed = self.parse_command()
            starts_new_row = (
                return_rows
                and isinstance(parsed, ICommandNode)
                and parsed.command == 'br'
            )
            if starts_new_row:
                # fixme: doesn't handle if some style scope continues across line break
                finished_rows.append(current)
                current = ITextNode()
            else:
                current.append(parsed)
        elif ch == '$':
            self.toggle_math_mode()
        elif ch in one_character_commands:
            current.append(self.parse_one_character_command())
        elif end_on_space and ch.isspace():
            self.feed.pop()
            break
        else:
            current.append(self.feed.pop())
    current = current.tidy(keep_node=False)
    if not return_rows:
        return current
    if current:
        finished_rows.append(current)
    return finished_rows
def reset(self):
    """ Reset the parser: the base class state is reset, the stack of open
    nodes is emptied and a fresh ITextNode becomes the current node. """
    super().reset()
    self.stack = []
    self.current = ITextNode()
def __init__(self, rows_mode=False):
    """ Create the parser with character references converted by the base
    class, an empty current node and no open tags or finished rows.

    :param rows_mode: stored as ``self.rows_mode``
    """
    super().__init__(convert_charrefs=True)
    self.rows_mode = rows_mode
    self.rows = []
    self.stack = []
    self.current = ITextNode()