def _handle_text(cls, node: etree._Element, do_handle_tail_instead=False):
    """Replace a node's text (or tail) with word nodes inserted into the tree.

    With ``do_handle_tail_instead`` false, ``node.text`` is stripped, cleared
    and the resulting word nodes become the first children of ``node``.
    With it true, ``node.tail`` is taken as-is (not stripped), cleared, and
    the word nodes are inserted into ``node``'s parent right after ``node``.

    Nothing happens when the selected string is missing, empty or
    whitespace-only.

    Args:
        node: Element whose text or tail should be converted.
        do_handle_tail_instead: Process ``node.tail`` rather than
            ``node.text``.

    Returns:
        None. The tree is modified in place.
    """
    raw = node.tail if do_handle_tail_instead else node.text
    if not raw or not raw.strip():
        return

    if do_handle_tail_instead:
        # Tail text lives in the parent, directly after the current node.
        content = raw
        node.tail = ''
        target = node.getparent()
        offset = target.index(node) + 1
    else:
        # Own text goes below the current node, at the very beginning.
        content = raw.strip()
        node.text = ''
        target = node
        offset = 0

    word_nodes = cls._str_2_word_nodes(content)

    # Every new word node records the 'class' attribute of the element it is
    # inserted into (the original author notes post-processing relies on it).
    inherited_class = target.attrib.get('class', '')
    for word_node in word_nodes:
        word_node.attrib[cls.PARENT_CLASS_ATTRIB_NAME] = inherited_class

    # Attach the word nodes consecutively, starting at the computed offset.
    for position, word_node in enumerate(word_nodes, start=offset):
        target.insert(position, word_node)
def fix_tail(self, item: etree._Element) -> None:
    """Reassign tail text around a newly inserted element.

    Designed only to work with self-closing elements, and only right after
    ``item`` has been inserted/appended into its parent.

    Args:
        item: The freshly inserted element; it must already have a parent.

    Returns:
        None. ``item.tail`` (and possibly the previous sibling's tail) are
        updated in place.
    """
    container = item.getparent()
    position = container.index(item)

    if position == 0:
        # No sibling precedes item: its tail becomes a copy of the parent's
        # text (in lxml, .text is the text in front of the first child).
        item.tail = container.text
        return

    # item takes over the tail of the sibling directly in front of it.
    previous = container[position - 1]
    item.tail = previous.tail

    # When item is now the last child, the preceding sibling's tail is
    # replaced with the parent's text.
    if position == len(container) - 1:
        previous.tail = container.text
def _add_kobo_spans_to_node(
    self, node: etree._Element, name: str
) -> etree._Element:
    """Rebuild ``node`` in place, routing its text through Kobo span creation.

    The node is emptied, its attributes are restored, and then its original
    text, each (deep-copied) child and each child's tail are re-added in
    order. Text and tails are handed to ``_append_kobo_spans_from_text``;
    when that call returns a falsy value the plain string is put back
    instead, otherwise ``self.paragraph_counter[name]`` is incremented.
    Children are processed recursively.

    Args:
        node: Element to process. ``None``, comments and processing
            instructions are returned without further processing.
        name: Key used for ``self.paragraph_counter`` and as the prefix of
            debug log messages.

    Returns:
        ``node`` itself, or, for tags listed in ``SPECIAL_TAGS``, a new
        ``koboSpan`` span element that wraps ``node``.
    """
    # A missing node is handed straight back.
    if node is None:
        self.log.debug(f"[{name}] Skipping comment/ProcessingInstruction node")
        return node

    # Comments and processing instructions only lose their tail.
    if isinstance(node, (etree._Comment, etree._ProcessingInstruction)):
        node.tail = None
        self.log.debug(f"[{name}] Skipping comment/ProcessingInstruction node")
        return node

    # Look at the tag name with any "{namespace}" prefix removed.
    tag_match = re.search(r"^(?:\{[^\}]+\})?(\w+)$", node.tag)
    if tag_match:
        local_name = tag_match.group(1)

        # Skipped tags are returned exactly as they are.
        if local_name in SKIPPED_TAGS:
            self.log.debug(f"[{name}] Skipping '{local_name}' tag")
            return node

        # Special tags are wrapped in one span; their children stay as-is.
        if local_name in SPECIAL_TAGS:
            self.log.debug(
                f"[{name}] Wrapping '{local_name}' tag and " + "ignoring children"
            )
            wrapper = etree.Element(
                f"{{{XHTML_NAMESPACE}}}span",
                attrib={
                    "id": f"kobo.{self.paragraph_counter[name]}.1",
                    "class": "koboSpan",
                },
            )
            wrapper.append(node)
            return wrapper

    # Remember everything clear() is about to discard.
    saved_text = node.text
    saved_children = deepcopy(node.getchildren())
    saved_attrs = {attr: node.get(attr) for attr in node.keys()}

    # Start from an empty element that keeps only its attributes.
    node.clear()
    for attr, value in saved_attrs.items():
        node.set(attr, value)

    # Leading text: converted to spans, or restored verbatim if none were added.
    if saved_text is not None:
        if self._append_kobo_spans_from_text(node, saved_text, name):
            self.paragraph_counter[name] += 1
        else:
            node.text = saved_text

    # Re-attach each child, followed by whatever its tail turns into.
    for child in saved_children:
        # Detach the tail first so it is not carried along by append().
        pending_tail, child.tail = child.tail, None
        node.append(self._add_kobo_spans_to_node(child, name))

        if pending_tail is not None:
            if self._append_kobo_spans_from_text(node, pending_tail, name):
                self.paragraph_counter[name] += 1
            else:
                # No spans were added: the tail goes back on the last child.
                node[-1].tail = pending_tail

    return node