def handle_textelem(element, potential_tags):
    '''Process text element and determine how to deal with its content.

    The element is dispatched on its tag to the matching handler
    (lists, quotes, titles, paragraphs, line breaks, formatting, tables);
    any other tag goes to handle_other_elements().

    Args:
        element: the element to process.
        potential_tags: collection of tag names; tables are only processed
            when it contains 'table', and it is forwarded to the paragraph
            and fallback handlers.

    Returns:
        The element produced by the selected handler, or None if the
        handler yields nothing.
    '''
    tag = element.tag
    new_element = None
    # bypass: nested elements
    if tag == 'list':
        new_element = handle_lists(element)
    elif tag == 'quote':  # + 'code'?
        new_element = handle_quotes(element)
    elif tag == 'head':
        new_element = handle_titles(element)
    elif tag == 'p':
        new_element = handle_paragraphs(element, potential_tags)
    elif tag == 'lb':
        # a line break is only of interest through the text following it
        if element.tail is not None and not element.tail.isspace():
            processed = process_node(element)
            # process_node() can return None (the other handlers check for
            # it), in which case there is nothing to wrap into a paragraph
            if processed is not None:
                new_element = etree.Element('p')
                new_element.text = processed.tail
    elif tag == 'hi':
        new_element = handle_formatting(element)
    elif tag == 'table' and 'table' in potential_tags:
        new_element = handle_table(element)
    else:
        # other elements (div, ??, ??)
        new_element = handle_other_elements(element, potential_tags)
    return new_element
def handle_titles(element):
    '''Process head elements (titles).

    The tail of the element is always discarded (and logged beforehand if
    it contains word characters), then the element goes through
    process_node().

    Returns:
        The processed title if its text contains at least one word
        character, otherwise None.
    '''
    # maybe needs attention
    trailing = element.tail
    if trailing and re.search(r'\w', trailing):
        LOGGER.debug('tail in title, stripping: %s', trailing)
    element.tail = None

    title = process_node(element)
    if title is None or not title.text:
        return None
    if not re.search(r'\w', title.text):
        return None
    return title
def handle_table(table_elem):
    '''Process single table element.

    Walks the table subtree in document order and builds a new 'table'
    element made of 'row' elements containing 'cell' elements. A 'tr'
    starts a new row; 'td' and 'th' nodes (as well as 'p', 'item', 'list'
    and 'a' nodes, which are retagged to 'td' in the source tree) become
    cells when process_node() yields non-empty text for them. Cells coming
    from 'th' get the attribute role="head". The walk stops at the first
    nested table.

    Args:
        table_elem: the table element to convert; tags of cell-like
            descendants are modified in place.

    Returns:
        The new 'table' element, or None if no cell with text was found.
    '''
    newtable = etree.Element('table')
    newrow = etree.Element('row')
    # explore sub-elements; iter() yields table_elem itself at position 0
    for position, subelement in enumerate(table_elem.iter()):
        # tuple membership compares by equality, so nodes whose tag is not
        # a string (e.g. comments) are simply skipped instead of raising
        if subelement.tag in ('p', 'item', 'list', 'a'):
            subelement.tag = 'td'
        if subelement.tag == 'tr':
            # flush the row collected so far
            if len(newrow) > 0:
                newtable.append(newrow)
                newrow = etree.Element('row')
        elif subelement.tag in ('td', 'th'):
            processed_cell = process_node(subelement)
            # skip cells without text
            if processed_cell is None or not processed_cell.text:
                continue
            newsub = etree.SubElement(newrow, 'cell')
            if subelement.tag == 'th':
                newsub.set('role', 'head')
            newsub.text = processed_cell.text
        # beware of nested tables
        elif subelement.tag == 'table' and position > 0:
            break
    # end of processing: flush the last row
    if len(newrow) > 0:
        newtable.append(newrow)
    if len(newtable) > 0:
        return newtable
    return None
def handle_quotes(element):
    '''Process quotes elements.

    Every node of the subtree (the element itself included, since iter()
    starts with it) is passed to process_node(); each node that survives
    contributes a flat sub-element carrying its text and tail. All visited
    source nodes are retagged 'done'.

    Returns:
        A new element with the tag of the input, or None if no node
        produced any content.
    '''
    quote = etree.Element(element.tag)
    for node in element.iter():
        cleaned = process_node(node)
        if cleaned is not None:
            entry = etree.SubElement(quote, node.tag)
            entry.text, entry.tail = cleaned.text, cleaned.tail
        # mark the source node as consumed
        node.tag = 'done'

    if len(quote) == 0:
        return None
    # avoid double/nested tags
    etree.strip_tags(quote, 'quote')
    return quote
def handle_lists(element):
    '''Process lists elements.

    Builds a new element with the same tag as the input and fills it with
    one fresh 'item' per 'item' descendant found by element.iter('item').
    Source nodes are retagged 'done' once visited.

    Returns:
        The new list element if it has children and its text content
        contains at least one non-whitespace character, otherwise None.
    '''
    processed_element = etree.Element(element.tag)
    for child in element.iter('item'):
        newchildelem = etree.Element('item')
        if len(child) == 0:
            # leaf item: copy text and tail of the processed node
            processed_child = process_node(child)
            if processed_child is not None:
                newchildelem.text = processed_child.text
                newchildelem.tail = processed_child.tail
                processed_element.append(newchildelem)
        else:
            # item with children: walk its whole subtree (iter() starts
            # with the item itself), fix for nested elements
            for subelem in child.iter():
                processed_subchild = handle_textnode(
                    subelem, comments_fix=False)
                # add child element to the new item
                if processed_subchild is not None:
                    subchildelem = etree.SubElement(newchildelem,
                                                    processed_subchild.tag)
                    subchildelem.text = processed_subchild.text
                    subchildelem.tail = processed_subchild.tail
                # retagging here also hides nested items from the
                # enclosing iter('item') loop
                subelem.tag = 'done'
            # drop 'item' tags nested inside the new item, keeping content
            etree.strip_tags(newchildelem, 'item')
        # NOTE(review): for leaf items this appends the same element a
        # second time; lxml should move rather than duplicate it - confirm
        if newchildelem.text or len(newchildelem) > 0:
            processed_element.append(newchildelem)
        child.tag = 'done'
    # avoid double tags??
    if len(processed_element) > 0:  # if it has children
        # test if it has text
        teststring = ''.join(processed_element.itertext())
        if len(teststring) > 0 and re.search(r'\S', teststring):
            return processed_element
    return None
def handle_paragraphs(element, potential_tags):
    '''Process paragraphs (p) elements along with their children,
    trim and clean the content.

    The attributes of the element are cleared. A paragraph without children
    is passed straight to process_node(). Otherwise a new paragraph is
    rebuilt from the nodes of the subtree whose tag is in potential_tags.

    Args:
        element: the paragraph element; it is modified in place
            (attributes cleared, processed descendants retagged 'done').
        potential_tags: collection of tag names allowed inside a paragraph.

    Returns:
        The processed paragraph, or None if nothing is left of it.
    '''
    element.attrib.clear()
    # no children
    if len(element) == 0:
        return process_node(element)
    # children
    processed_element = etree.Element(element.tag)
    for child in element.iter():
        if child.tag not in potential_tags:
            LOGGER.debug('unexpected elem in paragraph: %s %s %s',
                         child.tag, child.text, child.tail)
            continue
        processed_child = handle_textnode(child, comments_fix=False)
        if processed_child is None:
            continue
        # needing attention!
        if child.tag == 'p':
            LOGGER.debug('extra elem within p: %s %s %s',
                         child.tag, child.text, child.tail)
            # a paragraph without text of its own has nothing to contribute
            # (and concatenating with None would raise TypeError)
            if child.text is not None:
                processed_element.text = trim(' ' + child.text)
            continue
        newsub = etree.Element(child.tag)
        # handle formatting
        if child.tag == 'hi':
            # check depth and clean
            if len(child) > 0:
                for item in child:  # children are lists
                    if item.text is not None and not item.text.isspace():
                        item.text = ' ' + item.text
                    etree.strip_tags(child, item.tag)
            # only copy the attribute if present: lxml refuses None values
            rend = child.get('rend')
            if rend is not None:
                newsub.set('rend', rend)
        # handle line breaks
        elif child.tag == 'lb':
            try:
                processed_child.tail = process_node(child).tail
            except AttributeError:  # no text
                pass
        # prepare text
        if processed_child.text is None or processed_child.text.isspace():
            processed_child.text = ''
        # if there are already children, the text has to go into the tail
        if len(processed_element) > 0:
            if (processed_child.tail is not None
                    and not processed_child.tail.isspace()):
                newsub.tail = processed_child.text + processed_child.tail
            else:
                newsub.tail = processed_child.text
        else:
            newsub.text = processed_child.text
            newsub.tail = processed_child.tail
        processed_element.append(newsub)
        child.tag = 'done'
    # finish
    if len(processed_element) > 0 or processed_element.text:
        # clean trailing lb-elements
        if len(processed_element) > 0:
            last = processed_element[-1]
            if last.tag == 'lb' and last.tail is None:
                processed_element.remove(last)
        return processed_element
    LOGGER.debug('discarding p-child: %s', html.tostring(processed_element))
    return None