def transform(filename):
    """Convert legacy FrontPage-style HTML into cleaner semantic markup.

    <font> tags are mapped to semantic equivalents (red -> <strong>,
    size 6 -> <h1>, size 5 -> <h2>, size 4 dropped) and hyperlinks to
    .htm(l) files are rewritten to point at .md files.

    :param filename: path of the HTML file, read as ISO-8859-1
    :returns: the transformed document as a unicode HTML string
    """
    # Use a context manager so the file handle is not leaked.
    with open(filename, encoding='ISO-8859-1') as htmlfile:
        tree = html.parse(htmlfile)
    for node in tree.xpath('//font'):
        size = int(node.get('size')) if 'size' in node.attrib else None
        color = node.get('color').lower() if 'color' in node.attrib else ''
        if color == '#ff0000':
            strong = html.Element('strong')
            replace_tag(node, strong)
        elif size == 6:
            h1 = html.Element('h1')
            replace_tag(node, h1)
        elif size == 5:
            h2 = html.Element('h2')
            replace_tag(node, h2)
        elif size == 4:
            node.drop_tag()
    for node in tree.xpath('//a[@href]'):
        href = node.get('href')
        try:
            # rsplit on the LAST dot: plain split('.') raised ValueError
            # for names like "release.notes.htm" and silently skipped them.
            basename, extension = href.rsplit('.', 1)
        except ValueError:
            # No extension at all -- leave the link untouched.
            continue
        if extension.startswith('htm'):
            node.set('href', '{}.{}'.format(basename, 'md'))
    #for node in tree.xpath('//p[re:test(@align, "^center$", "i")]', namespaces={"re": "http://exslt.org/regular-expressions"}):
    #    node.set('align', None)
    transformed_html = etree.tostring(tree, pretty_print=True, method='html', encoding='unicode')
    return transformed_html
def _extract_tr(self, tr: html.Element): " extract a row " #print(f"tr ===>{html.tostring(tr)}<<====\n") elem = html.Element("tr") elem.text = "" elem.tail = "" cells = [] for x in tr: if x.tag != "td" and x.tag != "th": if x.tag == etree.Comment: continue if x.tag == "script": continue logger.warning(f" adding td around {html.tostring(x)}") ch_elem = html.Element("td") bad_elem, val = self._extract_any(x) if bad_elem != None: ch_elem.append(bad_elem) else: ch_elem.text = val else: ch_elem, val = self._extract_any(x) if ch_elem == None: ch_elem = html.Element(x.tag) ch_elem.tail = "" elem.append(ch_elem) cells.append(val) self._new_element.append(elem) self.rows.append(cells)
def clean28(filename, content):
    """Split text that trails an empty div.s12 into its own div.s12.

    When an empty <div class="s12"> is followed by tail text matching
    clean28regex1, the tail (and any subsequent <br>-separated matching
    tails) is moved into new <div class="s12"> elements.

    :param filename: source file name, printed when a change is made
    :param content: HTML content (bytes/str) to clean
    :returns: re-serialized content if changed, otherwise the input
    """
    html_content = html.fromstring(content)
    s12s = html_content.xpath('//div[@class=\'s12\']')
    has_changed = False
    for s12 in s12s:
        if ((s12.getchildren() is None or len(s12.getchildren()) == 0)
                and s12.tail is not None and len(s12.tail.strip()) > 0
                and clean28regex1.match(s12.tail.strip())):
            has_changed = True
            element = html.Element('div', {'class': 's12'})
            element.text = s12.tail.strip()
            s12.tail = ''
            s12.addnext(element)
            # Absorb any following <br>-delimited matching tails too.
            while (element.getnext() is not None
                    and element.getnext().tag == 'br'
                    and element.getnext().tail is not None
                    and len(element.getnext().tail.strip()) > 0
                    and clean28regex1.match(element.getnext().tail.strip())):
                new_element = html.Element('div', {'class': 's12'})
                new_element.text = element.getnext().tail.strip()
                element.getnext().tail = ''
                element.getnext().drop_tree()
                element.addnext(new_element)
                element = new_element
    if has_changed:
        # Fixed: Python 2 `print filename` was a SyntaxError under
        # Python 3 (this file uses f-strings elsewhere).
        print(filename)
        content = etree.tostring(html_content)
    return content
def clean32(filename, content):
    """Turn <center>text1<br/>text2</center> blocks into headings.

    If text1 matches clean32regex1, the block becomes an <h4> (for
    'pasal') or <h2 class=...> containing num/title spans.

    :param filename: source file name, printed when a change is made
    :param content: HTML content (bytes/str) to clean
    :returns: re-serialized content if changed, otherwise the input
    """
    html_content = html.fromstring(content)
    centers = html_content.xpath('//center')
    has_changed = False
    for center in centers:
        if (center.getchildren() and center.text
                and len(center.getchildren()) == 1
                and center.getchildren()[0].tag == 'br'
                and center.getchildren()[0].tail):
            text1 = center.text.strip()
            text2 = center.getchildren()[0].tail.strip()
            match = clean32regex1.match(text1)
            if match:
                c = match.group(1).lower()
                # 'pasal' sections get a plain h4; everything else an h2
                # carrying its section kind as the CSS class.
                if c == 'pasal':
                    element = html.Element('h4')
                else:
                    element = html.Element('h2', {'class': c})
                num = html.Element('span', {'class': 'num'})
                num.text = text1
                heading = html.Element('span', {'class': 'title'})
                heading.text = text2
                element.append(num)
                element.append(heading)
                center.addnext(element)
                center.drop_tree()
                has_changed = True
    if has_changed:
        # Fixed: Python 2 `print filename` was a SyntaxError under Python 3.
        print(filename)
        content = etree.tostring(html_content)
    return content
def clean10(filename, content):
    """Replace <center>text<br/>text</center> with bagian/bab headings.

    The two text parts are joined with ': ' and matched against
    clean10regex1 (bagian) then clean10regex2 (bab).

    :param filename: source file name, printed when a change is made
    :param content: HTML content (bytes/str) to clean
    :returns: re-serialized content if changed, otherwise the input
    """
    html_content = html.fromstring(content)
    center_parts = html_content.xpath('//center')
    has_changed = False
    for part in center_parts:
        child = part.getchildren()
        if (len(child) == 1 and child[0].tag == 'br' and part.text
                and part.text.strip() and child[0].tail
                and child[0].tail.strip()):
            text = part.text.strip() + ': ' + child[0].tail.strip()
            if clean10regex1.match(text):
                element = html.Element('h2', {'class': 'bagian'})
                element.text = text
                part.addprevious(element)
                part.drop_tree()
                has_changed = True
            elif clean10regex2.match(text):
                element = html.Element('h2', {'class': 'bab'})
                element.text = text
                part.addprevious(element)
                part.drop_tree()
                has_changed = True
    if has_changed:
        # Fixed: Python 2 `print filename` was a SyntaxError under Python 3.
        print(filename)
        content = etree.tostring(html_content)
    return content
def fix_orphan_html_list_items(el):
    """Wrap stray <li> elements (ones with no <ul>/<ol> parent) into lists.

    A run of consecutive orphan <li> siblings is re-parented under a new
    <ul> inserted at the first item's position; a lone orphan <li> is
    demoted to a <span> followed by a <br>. Returns the mutated element.
    """
    while True:
        orphans = el.xpath('//li[not(parent::ul) and not(parent::ol)]')
        if not orphans:
            break
        first = orphans[0]
        container = first.getparent()
        insert_at = container.index(first)  # where the first <li> sat
        trailing = list(takewhile(is_li_element, first.itersiblings()))
        if trailing:
            # Move the whole run of <li> siblings into a fresh <ul>;
            # anything that is not an <li> terminates the run.
            run = [first] + trailing
            wrapper = html.Element('ul')
            for item in run:
                wrapper.append(item)
            # Carry the last item's tail text out to the wrapper.
            wrapper.tail = run[-1].tail
            run[-1].tail = None
            container.insert(insert_at, wrapper)
        else:
            # Isolated <li>: keep its content inline instead.
            first.addnext(html.Element('br'))
            first.tag = 'span'
    return el
def _createNumberedElem(paraElem):
    '''
    Build an HTML list element from the numbering attributes tagged onto
    a paragraph element: <ul> for bullets, otherwise an <ol> whose
    ``type`` reflects the CAR_format (defaulting to decimal) and whose
    ``start`` comes from CAR_start.
    '''
    typeByFormat = {
        'decimal': '1',
        'lowerLetter': 'a',
        'upperLetter': 'A',
        'lowerRoman': 'i',
        'upperRoman': 'I'
    }
    fmt = paraElem.get('CAR_format')
    if fmt == 'bullet':
        return HTML.Element('ul')
    listElem = HTML.Element('ol')
    # Unknown formats fall back to decimal numbering.
    listElem.set('type', typeByFormat.get(fmt, '1'))
    listElem.set('start', paraElem.get('CAR_start'))
    return listElem
def populate_td(input_value, recid=None):
    '''Populate the <td> elements of the table.

    :param input_value: either a plain name string (with *recid*) or an
        iterable of spokesperson dicts with name/recid/start/end/curr keys
    :param recid: record id used to linkify a plain string value
    :returns: a linkified string, or a <ul> element of spokespersons
    '''
    if VERBOSE:
        # Fixed: Python 2 print statement was a SyntaxError under Python 3.
        print('input_value =', input_value)
    if recid and isinstance(input_value, str):
        return make_url(input_value, recid)
    ul_elem = LH.Element("ul")
    for spokesperson in input_value:
        name = spokesperson['name']
        # Strip a trailing parenthesised qualifier from the name.
        # Fixed: the `ur''` prefix is invalid syntax in Python 3.
        name = re.sub(r' \(.*', '', name)
        try:
            name = make_url(name, spokesperson['recid'])
        except KeyError:
            # No recid for this spokesperson; keep the bare name.
            pass
        try:
            display = LH.Element("li")
            display.append(name)
        except TypeError:
            # `name` is a plain string, not an element: build <li> directly.
            display = ELEMENT.li(name)
        dates = ' (' + spokesperson['start'] + ' - '
        if spokesperson['curr'].lower() == 'current':
            dates += 'present)'
            display.append(ELEMENT.b(dates))
        else:
            dates += spokesperson['end'] + ')'
            display.append(ELEMENT.a(dates))
        ul_elem.append(display)
    return ul_elem
def add_navbar_js(self):
    """Inject the bundled navbar script and the jQuery library into <head>.

    Reads ``js/navbar.js`` from the package directory and embeds it as an
    inline <script>, then inserts a <script src=...> tag for the hosted
    jQuery 3.4.1 build. Both are inserted at index 1 of the first <head>
    element of ``self.book``.
    """
    pkg_dir, _ = os.path.split(__file__)
    script_path = os.path.join(pkg_dir, "js", "navbar.js")
    with open(script_path, "r") as handle:
        navbar_source = handle.read()
    head = self.book.xpath("//head")[0]
    inline_script = html.Element("script")
    inline_script.text = navbar_source
    head.insert(1, inline_script)
    # jQuery is required by the navbar script.
    jquery_script = html.Element("script")
    jquery_script.attrib[
        "src"] = "https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"
    head.insert(1, jquery_script)
def clean23(filename, content):
    """Split a <center> with exactly three <br>-separated texts into two h2s.

    text1/text3 must match clean23regex1; each heading pairs a matched
    label with the following text ('label: text').

    :param filename: source file name, printed when a change is made
    :param content: HTML content (bytes/str) to clean
    :returns: re-serialized content if changed, otherwise the input
    """
    html_content = html.fromstring(content)
    center_parts = html_content.xpath('//center')
    has_changed = False
    for center_part in center_parts:
        # Added `center_part.text` guard: .strip() on a None text raised
        # AttributeError (sibling cleaners such as clean32 already guard).
        if (center_part.text is not None
                and center_part.getchildren() is not None
                and len(center_part.getchildren()) == 3
                and all((child.tag == 'br' and child.tail
                         and len(child.tail.strip()) > 0)
                        for child in center_part.getchildren())):
            children = center_part.getchildren()
            text1 = center_part.text.strip()
            text2 = children[0].tail.strip()
            text3 = children[1].tail.strip()
            text4 = children[2].tail.strip()
            match1 = clean23regex1.match(text1)
            match2 = clean23regex1.match(text3)
            if match1 and match2:
                has_changed = True
                element1 = html.Element('h2', {'class': match1.group(1).lower()})
                element1.text = text1 + ': ' + text2
                element2 = html.Element('h2', {'class': match2.group(1).lower()})
                element2.text = text3 + ': ' + text4
                center_part.addprevious(element1)
                center_part.addprevious(element2)
                center_part.drop_tree()
    if has_changed:
        # Fixed: Python 2 `print filename` was a SyntaxError under Python 3.
        print(filename)
        content = etree.tostring(html_content)
    return content
def _extract_any(self, x: html.Element) -> [html.Element, str]: " extract/simplify an HTML element (recursive) " #print(f"extract any ===>{html.tostring(x)}<<====\n") # nested tables are special because we are processing a flattend list so ignore them. if x.tag == "table": return html.Element("table"), "[TABLE]" # lists are special because we want to build up a comma seperated list if x.tag == "ul": return self._extract_list(x) if x.tag == etree.Comment: return etree.Comment(), "" # no children --> text element if len(x) == 0: if x.text == None: return None, "" elem, val = x, self._extract_text(x.text) return elem, val elem = html.Element(x.tag) items = [] if x.text != None: elem.text = x.text items.append(x.text) for y in x: #ignore/strip out layout tags if y.tag == etree.Comment: continue if y.tag in ["script", "noscript", "br", "hr", "input", "button", "svg", "img", "form"]: continue if y.tag in ["span", "div", "h3", "h2", "h1", "small", "strong", "em", "sup", "i", "a", "b", "u", "p", "ul", "label", "sub"]: elem_ch, s = self._extract_any(y) if elem_ch != None: if len(x) == 1: if s != None and s != "": elem.text = s else: elem.append(elem_ch) if s != None and s != "": items.append(s) elif y.tag == "table" or y.tag == "iframe": elem.append(html.Element(y.tag)) items.append(f"[{y.tag.upper()}]") else: logger.warning(f"unexpected tag {y.tag} ===>{html.tostring(y)}<<====\n") elem_ch, s = self._extract_any(y) if elem_ch != None: if len(x) == 1: if s != None and s != "": elem.text = s else: elem.append(elem_ch) if s != None and s != "": items.append(s) val = " ".join(items) return elem, val
def _render(self, template, values=None, **options):
    """ render(template, values, **options)

    Render the template specified by the given name.

    :param template: etree, xml_id, template name (see _get_template)
        * Call the method ``load`` is not an etree.
    :param dict values: template values to be used for rendering
    :param options: used to compile the template (the dict available for
        the rendering is frozen)
        * ``load`` (function) overrides the load method
    :returns: bytes marked as markup-safe (decode to
        :class:`markupsafe.Markup` instead of `str`)
    :rtype: MarkupSafe
    """
    # Enable qweb dev mode when listed in the server dev_mode config.
    context = dict(self.env.context, dev_mode='qweb' in tools.config['dev_mode'])
    context.update(options)
    result = super(IrQWeb, self)._render(template, values=values, **context)
    # Collapse runs of blank lines unless the caller opted out.
    if not values or not values.get('__keep_empty_lines'):
        result = markupsafe.Markup(IrQWeb._empty_lines.sub('\n', result.strip()))
    if 'data-pagebreak=' not in result:
        return result
    # Split tables at rows marked data-pagebreak="before"/"after":
    # rows up to the marker (inclusive for 'after') move into a new table
    # inserted before the original, followed by a page-break div.
    fragments = html.fragments_fromstring(result)
    for fragment in fragments:
        for row in fragment.iterfind('.//tr[@data-pagebreak]'):
            table = next(row.iterancestors('table'))
            newtable = html.Element('table', attrib=dict(table.attrib))
            thead = table.find('thead')
            if thead:
                # Repeat the header in the split-off table.
                newtable.append(copy.deepcopy(thead))
            # TODO: copy caption & tfoot as well?
            # TODO: move rows in a tbody if row.getparent() is one?
            pos = row.get('data-pagebreak')
            assert pos in ('before', 'after')
            for sibling in row.getparent().iterchildren('tr'):
                if sibling is row:
                    if pos == 'after':
                        newtable.append(sibling)
                    break
                newtable.append(sibling)
            table.addprevious(newtable)
            table.addprevious(html.Element('div', attrib={'style': 'page-break-after: always'}))
    return markupsafe.Markup(''.join(html.tostring(f).decode() for f in fragments))
def __parse_layout(self, xmlObj):
    '''Build an HTML grid for a layout XML node and attach it to its parent.

    Looks up the parent's HTML element by id, computes the grid size from
    the layout's <item row=... column=...> children, then creates a div
    (id/class/x-data-* attributes from the XML node) filled with one
    empty placeholder div per (row, col) cell.
    '''
    element_parent = xmlObj.getparent()
    if etree.iselement(element_parent):
        element_parent_id = element_parent.get('name')
        html_doc_fragment = self.__html.xpath(
            '//*[@id="{0}"]'.format(element_parent_id))
        if len(html_doc_fragment) > 0:
            html_doc_fragment = html_doc_fragment[0]
            # Grid bounds come from the highest row/column index used.
            max_rows = 0
            max_cols = 0
            for item in xmlObj.findall('item'):
                row = int(item.get('row'))
                col = int(item.get('column'))
                if col > max_cols:
                    max_cols = col
                if row > max_rows:
                    max_rows = row
            html_element = html.Element('div')
            html_element.set('class', xmlObj.get('class'))
            html_element.set('id', xmlObj.get('name'))
            html_element.set('x-data-name', xmlObj.get('name'))
            html_element.set('x-data-maxcols', str(max_cols))
            html_element.set('x-data-maxrows', str(max_rows))
            # One empty placeholder cell per grid position.
            for row in range(max_rows + 1):
                for col in range(max_cols + 1):
                    item_element = html.Element('div')
                    item_element.set(
                        'id',
                        "{0}_row{1}_col{2}".format(xmlObj.get('name'), row, col))
                    item_element.set('class', 'item item-empty')
                    html_element.append(item_element)
            # Removed leftover debug print() calls; Logger.info below
            # already records the generated layout.
            html_doc_fragment.append(html_element)
            self.Logger.info('[9] Added Layout {0}'.format(
                html.tostring(html_element)))
def transform(filename):
    """Transform a legacy FrontPage HTML file in preparation for Markdown.

    * <font> styling is replaced with semantic tags.
    * <a href> links to .htm(l) files are rewritten to relative .md paths
      (scheme/host dropped, fragment preserved).
    * MsoFootnoteReference spans are dropped and empty nodes removed.

    :param filename: path of the HTML file, read as latin-1
    :returns: the transformed document as a unicode HTML string
    """
    # Context manager so the file handle is not leaked.
    with open(filename, encoding='latin-1') as htmlfile:
        tree = html.parse(htmlfile, parser=parser)
    # Frontpage seems to use <font> tags to indicate headings
    for node in tree.xpath('//font'):
        size = int(node.get('size')) if 'size' in node.attrib else None
        color = node.get('color').lower() if 'color' in node.attrib else ''
        if color == '#ff0000':
            strong = html.Element('strong')
            replace_tag(node, strong)
        elif size == 6:
            h1 = html.Element('h1')
            replace_tag(node, h1)
        elif size == 5:
            h2 = html.Element('h2')
            replace_tag(node, h2)
        elif size == 4:
            node.drop_tag()
    # We rewrite all the urls to point to MD files instead of HTM
    for node in tree.xpath('//a[@href]'):
        href = node.get('href')
        try:
            parsed_url = urlparse(href)
            # Renamed local (was `filename`) -- it shadowed the parameter.
            path, fname = os.path.split(parsed_url.path)
            # rsplit on the LAST dot: plain split('.') raised ValueError
            # for multi-dot names and silently skipped those links.
            basename, extension = fname.rsplit('.', 1)
            hostname = parsed_url.hostname
        except ValueError:
            continue
        else:
            if hostname and hostname.startswith('anastasis'):
                # NOTE(review): hostname is not used below -- confirm intent.
                hostname = None
            if extension.startswith('htm'):
                if path:
                    # Fixed: keep the '/' separator between the directory
                    # and the file name (previously 'dirfile.md').
                    new_path = '{}/{}.{}'.format(path.lstrip('/'), basename, 'md')
                else:
                    new_path = '{}.{}'.format(basename, 'md')
                new_url = '', '', new_path, '', '', parsed_url.fragment
                node.set('href', urlunparse(new_url))
    # Pandoc passes this through, cluttering up the final markdown. Must come
    # after footnore rewriting.
    for node in tree.xpath('//span[@class="MsoFootnoteReference"]'):
        node.drop_tag()
    remove_empty(tree)
    return etree.tostring(tree, pretty_print=True, method='html', encoding='unicode')
def render(self, id_or_xml_id, values=None, **options):
    """ render(id_or_xml_id, values, **options)

    Render the template specified by the given name.

    :param id_or_xml_id: name or etree (see get_template)
    :param dict values: template values to be used for rendering
    :param options: used to compile the template (the dict available for
        the rendering is frozen)
        * ``load`` (function) overrides the load method
        * ``profile`` (float) profile the rendering (use astor lib) (filter
          profile line with time ms >= profile)
    """
    # Warn about leftover render_* helpers; this class no longer
    # dispatches on them, so their presence indicates dead code.
    for method in dir(self):
        if method.startswith('render_'):
            _logger.warning("Unused method '%s' is found in ir.qweb." % method)
    # Enable qweb dev mode when listed in the server dev_mode config.
    context = dict(self.env.context, dev_mode='qweb' in tools.config['dev_mode'])
    context.update(options)
    result = super(IrQWeb, self).render(id_or_xml_id, values=values, **context)
    # `result` is bytes here, hence the b'' literals.
    if b'data-pagebreak=' not in result:
        return result
    # Split tables at rows marked data-pagebreak="before"/"after":
    # rows up to the marker (inclusive for 'after') move into a new table
    # inserted before the original, followed by a page-break div.
    fragments = html.fragments_fromstring(result)
    for fragment in fragments:
        for row in fragment.iterfind('.//tr[@data-pagebreak]'):
            table = next(row.iterancestors('table'))
            newtable = html.Element('table', attrib=dict(table.attrib))
            thead = table.find('thead')
            if thead:
                # Repeat the header in the split-off table.
                newtable.append(copy.deepcopy(thead))
            # TODO: copy caption & tfoot as well?
            # TODO: move rows in a tbody if row.getparent() is one?
            pos = row.get('data-pagebreak')
            assert pos in ('before', 'after')
            for sibling in row.getparent().iterchildren('tr'):
                if sibling is row:
                    if pos == 'after':
                        newtable.append(sibling)
                    break
                newtable.append(sibling)
            table.addprevious(newtable)
            table.addprevious(html.Element('div', attrib={'style': 'page-break-after: always'}))
    return b''.join(html.tostring(f) for f in fragments)
def get_html(url):
    """Fetch *url* and return the parsed lxml document root.

    Returns an empty <html> element on any failure (timeout, network or
    parse error) so callers never have to handle exceptions themselves.
    Timeouts are additionally logged.
    """
    try:
        res = requests.get(url, timeout=30)
        parsed_page = html.fromstring(res.content)
    except requests.exceptions.Timeout:
        log.error(Directory.ERROR_MAP[4] % url)
        return html.Element('html')
    # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
    # propagate instead of being silently swallowed.
    except Exception:
        return html.Element('html')
    return parsed_page
def getMainPage(self, mathOutput='html'):
    """Serialize the main page to an HTML string.

    For 'svg' math output a fresh <html> skeleton is assembled and the
    prepare hooks fill in head and body; otherwise the cached document
    is serialized as-is.
    """
    if mathOutput != 'svg':
        return HTML.tostring(self._html)
    # Build a new document; `root` avoids shadowing the usual lxml
    # `html` module alias.
    root = HTML.Element('html')
    head = HTML.Element('head')
    body = HTML.Element('body')
    root.append(head)
    root.append(body)
    self._prepareHead(head, mathOutput='svg')
    self._prepareBody(body)
    return HTML.tostring(root)
def _extract_content(self):
    """ Pull information from HTML table

    1. Ignore TH/TD distinction
    2. remove content that only changes presentation
    3. assume script/comment tags do not contain data

    creates a new element fragment and List[List[Str]]
    embedded UL are converted into a comma delimited string
    """
    self.id = self.orig_element.get("id")
    if self.id is not None:
        self._new_element.attrib["id"] = self.id
    # Accumulator for orphan <td> children that lack a surrounding <tr>.
    tr_temp = html.Element("tr")
    for x in self.orig_element:
        # -- handle TD that are missing surrounding TR
        if x.tag == "td":
            logger.warning(f"misplaced TD: {html.tostring(x)}")
            tr_temp.append(x)
            continue
        elif len(tr_temp) > 0:
            # A non-td child ends the orphan run: flush it as one row.
            self._extract_tr(tr_temp)
            tr_temp = html.Element("tr")
        if x.tag == "tr":
            self._extract_tr(x)
        elif x.tag == "thead" or x.tag == "tbody" or x.tag == "tfoot":
            for y in x:
                if y.tag == "tr":
                    self._extract_tr(y)
                elif self.fail_on_unexpected_tags:
                    raise Exception(f"unexpected tag in tr: {y.tag}")
                else:
                    logger.warning(
                        f"unexpected tag in tr: {html.tostring(y)}")
        elif x.tag == "colgroup":
            # Presentation-only; carries no data.
            pass
        elif x.tag == "caption":
            self._extract_caption(x)
        elif self.fail_on_unexpected_tags:
            logger.warning(f"unexpected tag in table: {html.tostring(x)}")
            raise Exception(f"unexpected tag in table: {x.tag}")
        else:
            logger.warning(f"unexpected tag: {html.tostring(x)}")
    # BUG FIX: a trailing run of orphan <td> cells (table ending in bare
    # td's) was previously never flushed and its data silently dropped.
    if len(tr_temp) > 0:
        self._extract_tr(tr_temp)
def make_source_link(kind: str, stage: str, name: str) -> html.Element:
    """Build one breadcrumb segment for *stage* as a <span>.

    The stage label becomes a hyperlink to ``../<stage>/<name>`` unless
    the current *kind* equals the stage (or is "source"), in which case
    it is rendered as plain text. A " < " separator is set as tail text.
    """
    segment = html.Element("span")
    if kind == stage or kind == "source":
        segment.text = stage
    else:
        # "http://covid19-api.exemplartech.com/github-data/raw/AZ.html
        link = html.Element("a")
        link.attrib["href"] = f"../{stage}/{name}"
        link.text = stage
        segment.append(link)
    segment.tail = " < "
    return segment
def separate_summary(htmlRoot):
    """Split a report document on '***' <h2> markers.

    The first marker starts the summary; the second (if any) starts the
    report body. Content before the first marker is dropped.

    :param htmlRoot: parsed lxml document root (mutated in place)
    :returns: [summary_div_or_None, report_div_or_None]
    """
    firstIndicator = None
    secondIndicator = None

    def drop_sibs(element, backwards=False):
        # Remove every sibling on one side of *element*.
        siblingIterator = element.itersiblings(preceding=backwards)
        for sibling in siblingIterator:
            sibling.drop_tree()

    def move_sibs(start, stop, destination):
        # Move *start* and its following siblings (up to, not including,
        # *stop*) into *destination*.
        siblingIterator = start.itersiblings()
        destination.append(start)
        for sibling in siblingIterator:
            # `is not` instead of `!=`: identity comparison avoids lxml's
            # element-vs-None comparison warnings and is the intent here.
            if sibling is not stop:
                destination.append(sibling)
            else:
                break

    elements = htmlRoot.iter('h2')
    for element in elements:
        if str(element.text_content()).count('***'):
            if firstIndicator is None:
                firstIndicator = element
            elif secondIndicator is None:
                secondIndicator = element
            else:
                pass
    if firstIndicator is None:
        # No markers found. Previously summary/report were left unbound
        # here, making the return below raise NameError.
        summary = None
        report = None
        # more code needed - probably change html tag to div id="report" tag
    elif secondIndicator is None:
        summary = None
        report = html.Element('div', attrib={'id': 'report'})
        report.tail = '\n'
        drop_sibs(firstIndicator, backwards=True)
        move_sibs(firstIndicator, None, report)
    else:
        summary = html.Element('div', attrib={'id': 'summary'})
        summary.tail = '\n'
        report = html.Element('div', attrib={'id': 'report'})
        report.tail = '\n'
        drop_sibs(firstIndicator, backwards=True)
        move_sibs(firstIndicator, secondIndicator, summary)
        move_sibs(secondIndicator, None, report)
    return([summary, report])
def write_as_html(self, foutput, name: str, url: str, tables: List[ContentTable], html_doc: html.Element):
    """Render *tables* as an HTML section for *name*.

    Writes a standalone page (a deep copy of the section wrapped in
    <html><body>) to *foutput*, and appends the section itself plus an
    <hr> divider to the running *html_doc*.
    """
    section = html.Element("div")
    title = html.Element("h1")
    title.text = name
    section.append(title)
    # Timestamp of the cached source page.
    stamp = html.Element("div")
    stamp.text = self.cache.read_date_time_str(name + ".html")
    section.append(stamp)
    for table in tables:
        section.append(table.new_element)
        section.append(html.Element("br"))
    link = html.Element("a")
    link.attrib["href"] = url
    link.text = url
    section.append(link)
    # Standalone page gets its own copy of the section.
    page = html.Element("html")
    page.append(html.Element("body"))
    page[0].append(deepcopy(section))
    foutput.write(html.tostring(page, pretty_print=True))
    html_doc.append(section)
    html_doc.append(html.Element("hr"))
def write_cache(self, style, fonts, svgs):
    """Cache the computed style text, font CSS, and SVG elements.

    Serializes everything under a single <cache> root and writes it to
    ``self.svg_cache`` as bytes.
    """
    root = html.Element('cache')
    style_elt = html.Element('style', id='pretex-style')
    style_elt.text = style
    root.append(style_elt)
    fonts_elt = html.Element('style', id='pretex-fonts')
    fonts_elt.text = fonts
    root.append(fonts_elt)
    for svg in svgs:
        # Clear tail text so the serialized cache stays clean.
        svg.tail = ''
        root.append(svg)
    with open(self.svg_cache, 'wb') as fobj:
        fobj.write(html.tostring(root))
def parse_print_tab_kangxi(dict_root, homo_no, word, content):
    """Parse Kangxi dictionary content for *word* and append an entry.

    Builds an <idx:entry> (orth, headword, description senses) under
    *dict_root*. Returns False when *content* is empty or cannot be
    parsed, True on success.
    """
    parsed_word={}
    if not (len(content) and parse_word_kangxi(word, content, parsed_word)):
        return False
    entry=html.Element("idx:entry", scriptable='yes')
    dict_root.append(entry)
    entry.append(html.Element("idx:orth",value=word))
    # For each category on the "detailed explanation" page, display in the
    # order: detailed meaning, basic meaning, part-of-speech changes.
    # First, show the headword itself.
    b=html.Element('b')
    entry.append(b)
    # Basic explanation: homo_no='1'; detailed explanation: homo_no='2'.
    #make_sub_elem(b, 'word', {"homo_no":"1"}, parsed_ziyi['zi'])
    make_sub_elem(b, 'word', _text= word)
    entry.append(html.Element('br'))
    category=html.Element('category')
    entry.append(category)
    sense=html.Element('sense')
    category.append(sense)
    # Short summary ('jianjie') first, then each explanation ('jieshi').
    make_sub_elem(sense, 'description',_text=parsed_word['jianjie'])
    sense.append(html.Element('br'))
    for desc in parsed_word['jieshi']:
        make_sub_elem(sense, 'description',_text=desc)
        sense.append(html.Element('br'))
    make_sub_elem(dict_root,'hr')
    return True
def _add_html_info_row(self, t: html.Element, label: str, val: str, cls: str = None):
    """Append a two-cell row (*label*, *val*) to table element *t*.

    :param t: the table element to extend
    :param label: text of the first cell
    :param val: text of the second cell
    :param cls: optional CSS class applied to both cells
    """
    tr = html.Element("tr")
    # Both cells are built identically; loop instead of duplicating.
    for text in (label, val):
        td = html.Element("td")
        td.text = text
        # `is not None` instead of `!= None` (PEP 8 identity comparison).
        if cls is not None:
            td.attrib["class"] = cls
        tr.append(td)
    tr.tail = "\n "
    t.append(tr)
def get_wrapper_tag(self):
    """Choose the element used to wrap stray inline content, or None.

    Honors ``wrap_inline_tags``: in auto mode (None/True) prefer <p>,
    then <div>; an explicit 'p'/'div' is used when either is allowed;
    a callable may construct the element itself, kept only when its tag
    is in ``allow_tags``. Returns None when nothing applies.
    """
    allowed = self.allow_tags
    if allowed is None:
        return
    wrap = self.wrap_inline_tags
    if wrap in (None, True):
        # Auto mode: first allowed of <p>, <div>.
        for tag in ('p', 'div'):
            if tag in allowed:
                return html.Element(tag)
    elif wrap in ('p', 'div'):
        if 'p' in allowed or 'div' in allowed:
            return html.Element(wrap)
    elif callable(wrap):
        candidate = wrap()
        if candidate.tag in allowed:
            return candidate
def load_info(self, item: ChangeItem, body: html.Element):
    """Fill *body* with the header block for a change item.

    Adds an <h3> with the item name, its source links for the extract
    stage, and a trailing <br>, preserving newline/indent tail text.
    """
    body.text = "\n "
    heading = html.Element("h3")
    heading.text = item.name
    heading.tail = "\n\n "
    body.append(heading)
    links = html_helpers.make_source_links("extract", item.name, item.source)
    body.append(links)
    body[len(body) - 1].tail = "\n "
    line_break = html.Element("br")
    line_break.tail = "\n "
    body.append(line_break)
def markdown(value, style, math_engine=None, lazy_load=False):
    """Render markdown *value* to safe HTML using the named style profile.

    Style flags (safe_mode, nofollow, texoid, math, use_camo) come from
    settings.MARKDOWN_STYLES[style], falling back to
    MARKDOWN_DEFAULT_STYLE. Optional post-processors (camo URL rewriting,
    lazy-load) run over the parsed HTML tree. Returns a Markup string.
    """
    styles = getattr(settings, 'MARKDOWN_STYLES', {}).get(style, getattr(settings, 'MARKDOWN_DEFAULT_STYLE', {}))
    escape = styles.get('safe_mode', True)
    nofollow = styles.get('nofollow', True)
    # texoid/math are only honored when the corresponding backends exist.
    texoid = TEXOID_ENABLED and styles.get('texoid', False)
    math = hasattr(settings, 'MATHOID_URL') and styles.get('math', False)
    post_processors = []
    if styles.get('use_camo', False) and camo_client is not None:
        post_processors.append(camo_client.update_tree)
    if lazy_load:
        post_processors.append(lazy_load_processor)
    renderer = AwesomeRenderer(escape=escape, nofollow=nofollow, texoid=texoid,
                               math=math and math_engine is not None, math_engine=math_engine)
    markdown = mistune.Markdown(renderer=renderer, inline=AwesomeInlineLexer,
                                parse_block_html=1, parse_inline_html=1)
    result = markdown(value)
    # Post-processors need a DOM; only parse when at least one is active.
    if post_processors:
        try:
            tree = html.fromstring(result, parser=html.HTMLParser(recover=True))
        except (XMLSyntaxError, ParserError) as e:
            # An empty document is expected (empty input); anything else
            # is logged. Either way, recover with an empty <div>.
            if result and (not isinstance(e, ParserError) or e.args[0] != 'Document is empty'):
                logger.exception('Failed to parse HTML string')
            tree = html.Element('div')
        for processor in post_processors:
            processor(tree)
        result = html.tostring(tree, encoding='unicode')
    return Markup(result)
def markdown(value, style, math_engine=None, lazy_load=False):
    """Render markdown *value* to safe HTML using the named style profile.

    Style flags (safe_mode, nofollow) come from
    settings.MARKDOWN_STYLES[style], falling back to
    MARKDOWN_DEFAULT_STYLE. An optional lazy-load post-processor runs
    over the parsed HTML tree. Returns a Markup string.

    :raises ValueError: when the rendered HTML cannot be parsed
    """
    styles = getattr(settings, 'MARKDOWN_STYLES', {}).get(style, getattr(settings, 'MARKDOWN_DEFAULT_STYLE', {}))
    escape = styles.get('safe_mode', True)
    nofollow = styles.get('nofollow', True)
    post_processors = []
    if lazy_load:
        post_processors.append(lazy_load_processor)
    renderer = AwesomeRenderer(escape=escape, nofollow=nofollow)
    markdown = mistune.Markdown(renderer=renderer, inline=AwesomeInlineLexer,
                                parse_block_html=1, parse_inline_html=1)
    result = markdown(value)
    # Post-processors need a DOM; only parse when at least one is active.
    if post_processors:
        try:
            tree = html.fromstring(result, parser=html.HTMLParser(recover=True))
        except (XMLSyntaxError, ParserError) as e:
            if result and (not isinstance(e, ParserError) or e.args[0] != 'Document is empty'):
                # Chain the original parse error (PEP 3134) so the real
                # cause is not lost from the traceback.
                raise ValueError('Failed to parse HTML string') from e
            # Empty-document case: recover with an empty <div>.
            tree = html.Element('div')
        for processor in post_processors:
            processor(tree)
        result = html.tostring(tree, encoding='unicode')
    return Markup(result)
def convert_html_to_text(html_str):
    """
    If lxml is available, convert to Markdown (but badly)
    otherwise just strip_tags
    """
    try:
        from lxml import html
    except ImportError:
        # No lxml installed: degrade to a plain tag strip.
        return strip_tags(html_str)
    document = html.fromstring(html_str)
    # Work on <body> when present, otherwise the whole document.
    body_matches = document.xpath('./body')
    body = body_matches[0] if body_matches else document
    # Swap each convertible element for a <span> holding its rendering.
    for tag_name, converter in HTML_CONVERTERS.items():
        for node in body.xpath('.//' + tag_name):
            span = html.Element("span")
            span.text = converter(node)
            node.getparent().replace(node, span)
    raw = html.tostring(body, pretty_print=True, method='text', encoding='utf-8').decode('utf-8')
    trimmed = (line.strip() for line in raw.splitlines())
    return '\n'.join(trimmed).strip()
def clean17(filename, content):
    """Turn a div.s140 of <br>-separated list-like lines into an <ol>.

    The div's text and each <br> tail become candidate items; when every
    item matches clean17regex1 or every item matches clean17regex2, the
    div is rebuilt as an <ol> of <li> elements.

    :param filename: source file name, printed when a change is made
    :param content: HTML content (bytes/str) to clean
    :returns: re-serialized content if changed, otherwise the input
    """
    html_content = html.fromstring(content)
    s140s = html_content.xpath('//div[@class=\'s140\']')
    has_changed = False
    for s140 in s140s:
        if (s140.text and len(s140.text.strip()) > 0
                and len(s140.getchildren()) > 0
                and all(child.tag == 'br' for child in s140.getchildren())):
            text = []
            text.append(s140.text.strip())
            for child in s140.getchildren():
                if child.tail and len(child.tail.strip()):
                    text.append(child.tail.strip())
            # All lines must match the SAME pattern to qualify as a list.
            if (all(clean17regex1.match(t) for t in text)
                    or all(clean17regex2.match(t) for t in text)):
                has_changed = True
                s140.text = ''
                # getchildren() returns a list copy, so removal is safe.
                for child in s140.getchildren():
                    s140.remove(child)
                for t in text:
                    element = html.Element('li')
                    element.text = t
                    s140.append(element)
                s140.tag = 'ol'
    if has_changed:
        # Fixed: Python 2 `print filename` was a SyntaxError under Python 3.
        print(filename)
        content = etree.tostring(html_content)
    return content