def get_progressbar_element(percentage):
    """Build a Bootstrap progress-bar widget for *percentage*.

    Args:
        percentage: completion value, coerced with ``int()``; >= 70 renders
            green (success), >= 50 yellow (warning), anything lower red
            (danger).

    Returns:
        The outer ``div.progress`` element with the styled bar as its child.
    """
    progressbar_child_element = Element("div")
    percentage = int(percentage)
    if percentage >= 70:
        # green
        progressbar_child_element.attrib[
            "class"] = "progress-bar progress-bar-success progress-bar-striped"
    elif percentage >= 50:
        progressbar_child_element.attrib[
            "class"] = "progress-bar progress-bar-warning progress-bar-striped"
    else:
        progressbar_child_element.attrib[
            "class"] = "progress-bar progress-bar-danger progress-bar-striped"
    progressbar_child_element.attrib["role"] = "progressbar"
    # BUG FIX: aria-valuenow was hard-coded to "10" (the intended expression
    # was left in a comment); report the actual percentage for screen readers.
    progressbar_child_element.attrib["aria-valuenow"] = "{}".format(percentage)
    progressbar_child_element.attrib["aria-valuemin"] = "0"
    progressbar_child_element.attrib["aria-valuemax"] = "100"
    progressbar_child_element.attrib[
        "style"] = "width:{}%; text-align:left; padding-left: 5px;".format(
            percentage)
    progressbar_child_element.text = "{}%".format(percentage)
    progressbar_element = Element("div")
    progressbar_element.attrib["class"] = "progress"
    progressbar_element.attrib["style"] = "margin-bottom:0;"
    progressbar_element.append(progressbar_child_element)
    return progressbar_element
def get_user(username, rank):
    """Render a username as an inline element.

    Unranked users (``rank is None``) become a plain <span>; ranked users
    become an <a> whose CSS class is the rank and which links to the
    user's page.
    """
    if rank is None:
        node = Element('span')
    else:
        attributes = {
            'class': rank,
            'href': reverse('user_page', args=[username]),
        }
        node = Element('a', attributes)
    node.text = username
    return node
def get_user(username, data):
    """Render a username, optionally styled by profile data.

    Without *data* a bare <span> holds the name; with data the name is a
    link to the user's page wrapped in a <span> carrying the user's CSS
    class derived from ``Profile.get_user_css_class``.
    """
    if not data:
        plain = Element('span')
        plain.text = username
        return plain
    wrapper = Element('span', {'class': Profile.get_user_css_class(*data)})
    anchor = Element('a', {'href': reverse('user_page', args=[username])})
    anchor.text = username
    wrapper.append(anchor)
    return wrapper
def _apply_headers_anchors(html: str) -> str:
    """Prepend a self-referencing anchor link (icon span inside an <a>)
    to every header element at the top level of *html*.

    Returns the transformed HTML string.
    """
    root_element = fromstring(wrap_unwrap_fake_tag(html))
    for element in root_element:
        if element.tag not in HEADERS:
            continue
        # BUG FIX: a header with no text would crash both make_header_id()
        # and the ``element.text += ' '`` below (None has no +=); skip it.
        if not element.text:
            continue
        id_ = make_header_id(element.text)
        a_element = Element('a', {'id': id_, 'href': f'#{id_}'})
        span_element = Element('span', attrib={'class': 'iconify',
                                               'data-icon': HTMLGen.ANCHOR_LINK_ICON_CLASS})
        a_element.append(span_element)
        # Separate the icon from the header text.
        element.text += ' '
        element.insert(0, a_element)
    html = tostring(root_element)
    html = wrap_unwrap_fake_tag(html, wrap=False)
    return html
def _apply_font_icons(html):
    """Prefix every non-empty <a> in *html* with an icon <span> chosen by
    the kind of resource its href points at: GitHub link, other external
    http(s) link, in-page anchor, or a known file extension."""
    root = fromstring(wrap_unwrap_fake_tag(html))
    for element in root.iter('a'):
        resource = element.attrib.get('href')
        if not (resource and element.text):  # .text empty in anchors <a>
            continue
        # External link
        if resource.startswith('https://github.com'):
            icon_class = HTMLGen.EXTERNAL_LINK_GITHUB_ICON_CLASS
        elif resource.startswith('http'):
            icon_class = HTMLGen.EXTERNAL_LINK_ICON_CLASS
        # Anchor
        elif resource.startswith('#'):
            icon_class = HTMLGen.ANCHOR_LINK_ICON_CLASS
        # File
        elif any(map(resource.endswith, HTMLGen.EXTENSIONS_ICON_CLASSES_MAP.keys())):
            # NOTE(review): the endswith check matches map keys verbatim,
            # but the lookup key is everything after the last '.' —
            # presumably the map keys are bare extensions without a leading
            # dot; confirm, otherwise this lookup would raise KeyError.
            extension = resource.rsplit('.', 1)[-1]
            icon_class = HTMLGen.EXTENSIONS_ICON_CLASSES_MAP[extension]
        else:
            # Unrecognised resource kind: report and leave the link as-is.
            print('Unknown icon resource ', resource)
            continue
        # Element prototype
        span_element = Element('span', attrib={'class': 'iconify',
                                               'data-icon': icon_class})
        # Move the link text after the icon so the icon renders first.
        span_element.tail = ' ' + element.text
        element.text = None
        element.insert(0, span_element)
    html = tostring(root)
    html = wrap_unwrap_fake_tag(html, wrap=False)
    return html
def main():
    """Convert every IntelliJ ``*.iml`` module file found under the folder
    given in ``sys.argv[1]`` into Eclipse ``.classpath`` and ``.project``
    files written next to each module, showing a progress bar."""
    absolute_folder = sys.argv[1]
    pattern = '*.iml'
    fileList = []
    # Walk through directory collecting every .iml module file
    for dName, sdName, fList in os.walk(absolute_folder):
        for fileName in fList:
            if fnmatch.fnmatch(fileName, pattern):  # Match search string
                fileList.append(os.path.join(dName, fileName))
    pbar = ProgressBar(widgets=['Processing :', Percentage(), ' ', Bar(), ' ', ETA()],
                       maxval=len(fileList)).start()
    fcount = 0
    for fileName in fileList:
        output_dict = generate_empty_dict()
        eclipse_file_path = os.path.dirname(fileName) + '/.classpath'
        with open(fileName, 'r') as f:
            intellij_data = f.read()
        # BUG FIX: the original guard was ``if not intellij_data: pass`` —
        # a no-op — so empty module files fell straight through into
        # xmltodict.parse(). Skip them instead.
        if not intellij_data:
            continue
        intellij_dict = xmltodict.parse(intellij_data)
        fcount = fcount + 1
        # Accumulate the Eclipse classpath entries from the IntelliJ model.
        output_dict = addSrcType(intellij_dict, output_dict)
        output_dict = addCombinedRules(intellij_dict, output_dict)
        output_dict = addConType(intellij_dict, output_dict)
        result = bf.etree(output_dict, root=Element('classpath'))
        with open(eclipse_file_path, 'w') as f:
            # Re-wrap the serialized XML so <classpath> sits directly after
            # the XML declaration (works around the serializer's layout).
            data = tostring(result, doctype='<?xml version="1.0" encoding="UTF-8"?>')
            data = data.replace('<classpath>', '')
            data = data.replace('</classpath>', '')
            data = data.replace('<?xml version="1.0" encoding="UTF-8"?>',
                                '<?xml version="1.0" encoding="UTF-8"?><classpath>')
            data = data + '</classpath>'
            f.write(data)
        # Add .project file
        project_path = os.path.dirname(fileName) + '/.project'
        xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>%s</name>
    <comment/>
    <projects/>
    <buildSpec>
        <buildCommand>
            <name>org.eclipse.jdt.core.javabuilder</name>
            <arguments/>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.eclipse.jdt.core.javanature</nature>
    </natures>
</projectDescription>"""
        root_name = os.path.splitext(os.path.basename(fileName))[0]
        xml_data = xml_data % (root_name)
        with open(project_path, 'w') as f:
            f.write(xml_data)
        pbar.update(fcount)
    pbar.finish()
def footer(self, node):
    """Create a standard footer block for HTML files."""
    block = Element('footer')
    block.append(E.HR())
    stamp = ("Generated automatically from {source} at {time:%d %b %Y %H:%M}."
             .format(source=node.sourcefile, time=datetime.datetime.now()))
    block.append(E.P(stamp))
    return block
def get_user_rating(username, rating):
    """Render a username linked to the user's page; when *rating* is
    truthy, decorate it with a rating box and rating-colored text."""
    link = Element('a', {
        'class': 'rate-group',
        'href': reverse('user_page', args=[username])
    })
    # Guard clause: unrated users get the bare name inside the link.
    if not rating:
        link.text = username
        return link
    rating_css = rating_class(rating)
    rate_box = Element('span', {'class': 'rate-box ' + rating_css})
    fill = Element('span', {'style': 'height: %3.fem' % rating_progress(rating)})
    rate_box.append(fill)
    name_span = Element('span', {'class': 'rating ' + rating_css})
    name_span.text = username
    link.append(rate_box)
    link.append(name_span)
    return link
def test_link_element(self):
    """A <link href> element yields exactly one extracted link."""
    node = Element('link', {'href': '#'})
    results = list(links(node))
    self.assertEqual(len(results), 1)
    found_el, found_attr, found_url, found_pos = results[0]
    self.assertEqual(found_url, '#')
    self.assertEqual(found_pos, 0)
    self.assertEqual(found_attr, 'href')
    self.assertEqual(found_el.tag, 'link')
    self.assertEqual(found_el.attrib, {'href': '#'})
def test_get_wrapper_tag(self):
    """get_wrapper_tag() yields a wrapper element only when one can be
    derived from the allow_tags / wrap_inline_tags combination."""
    cases = [
        (dict(allow_tags=None, wrap_inline_tags='div'), None),
        (dict(allow_tags=['p', 'div'], wrap_inline_tags=False), None),
        (dict(allow_tags=['p', 'div'], wrap_inline_tags=None), 'p'),
        (dict(allow_tags=['div'], wrap_inline_tags=None), 'div'),
        (dict(allow_tags=['b'], wrap_inline_tags=None), None),
        (dict(allow_tags=['p', 'div'], wrap_inline_tags='div'), 'div'),
        (dict(allow_tags=['p', 'div', 'span'],
              wrap_inline_tags=(lambda: Element('span'))), 'span'),
        (dict(allow_tags=['p', 'div'],
              wrap_inline_tags=(lambda: Element('span'))), None),
    ]
    for kwargs, expected_tag in cases:
        cleaner = html.Cleaner(**kwargs)
        wrapper = cleaner.get_wrapper_tag()
        if expected_tag is None:
            self.assertEqual(wrapper, None)
        else:
            self.assertEqual(wrapper.tag, expected_tag)
def test_form_element(self):
    """A <form action> element yields exactly one extracted link."""
    node = Element('form', {'action': '#'})
    results = list(links(node))
    self.assertEqual(len(results), 1)
    found_el, found_attr, found_url, found_pos = results[0]
    self.assertEqual(found_url, '#')
    self.assertEqual(found_pos, 0)
    self.assertEqual(found_attr, 'action')
    self.assertEqual(found_el.tag, 'form')
    self.assertEqual(found_el.attrib, {'action': '#'})
def test_meta_refresh_element(self):
    """A <meta http-equiv=refresh> yields its content URL as a link."""
    attributes = {'http-equiv': 'refresh', 'content': '#'}
    node = Element('meta', attributes)
    results = list(links(node))
    self.assertEqual(len(results), 1)
    found_el, found_attr, found_url, found_pos = results[0]
    self.assertEqual(found_url, '#')
    self.assertEqual(found_pos, 0)
    self.assertEqual(found_attr, 'content')
    self.assertEqual(found_el.tag, 'meta')
    self.assertEqual(found_el.attrib, attributes)
def test_inline_css_url(self):
    """A url(...) inside an inline style attribute is extracted, with the
    position pointing into the attribute value."""
    style_value = 'background: url("#");'
    node = Element('div', {'style': style_value})
    results = list(links(node))
    self.assertEqual(len(results), 1)
    found_el, found_attr, found_url, found_pos = results[0]
    self.assertEqual(found_url, '#')
    self.assertEqual(found_pos, 17)
    self.assertEqual(found_attr, 'style')
    self.assertEqual(found_el.tag, 'div')
    self.assertEqual(found_el.attrib, {'style': style_value})
def test_script_element(self):
    """A <script src> element yields exactly one extracted link."""
    node = Element('script', {'src': '#'})
    results = list(links(node))
    self.assertEqual(len(results), 1)
    found_el, found_attr, found_url, found_pos = results[0]
    self.assertEqual(found_url, '#')
    self.assertEqual(found_pos, 0)
    self.assertEqual(found_attr, 'src')
    self.assertEqual(found_el.tag, 'script')
    self.assertEqual(found_el.attrib, {'src': '#'})
def brs_to_paragraphs(tree, inline_tags=None):
    """
    Return an lxml tree with all <br> elements stripped and paragraphs
    put in place where necessary.

    inline_tags: tags that are kept inside the paragraph currently being
    built; any other tag closes the current paragraph. Defaults to ['a'].
    """
    inline_tags = inline_tags or ['a']

    # if this tree doesn't have any child elements, just return it as is
    if len(tree) == 0:
        return tree

    # if this tree doesn't contain any <br> tags, we don't need to touch it
    if tree.find('.//br') is None:
        return tree

    # XXX: We're building a whole new tree here and leaving out any attributes.
    # A) That might be a little slower and more memory intensive than modifying
    # the tree in place, and B) we're dropping any attributes on block elements.
    # The latter is probably fine for current use, but certainly not ideal.
    new_tree = Element(tree.tag)

    # ``p`` is the paragraph currently being built, if any.
    p = None

    # if this tree starts out with text, create a new paragraph for it, and
    # add it to the tree
    if tree.text:
        p = E.P()
        p.text = tree.text
        new_tree.append(p)

    for e in tree:
        if e.tag == 'br':
            # avoid adding empty p elements
            if e.tail is None:
                continue
            # start a new p
            p = E.P()
            p.text = e.tail
            new_tree.append(p)
        # if this is a block tag, and it has trailing text, that text needs to
        # go into a new paragraph... only if the tail has actual content and
        # not just whitespace though.
        # NOTE: re.match only inspects the first character of the tail; a
        # tail like " text" is deliberately(?) not wrapped — kept as-is.
        elif e.tail and re.match(r'[^\s]', e.tail) and e.tag not in inline_tags:
            p = E.P()
            p.text = e.tail
            e.tail = ''
            new_tree.append(e)
            new_tree.append(p)
        # keep inline tags inside the current paragraph
        elif e.tag in inline_tags:
            # BUG FIX: an inline tag appearing before any paragraph existed
            # previously raised UnboundLocalError on ``p``.
            if p is None:
                p = E.P()
                new_tree.append(p)
            p.append(e)
        else:
            # BUG FIX: propagate inline_tags into the recursion; the
            # original call silently reverted nested levels to ['a'].
            new_tree.append(brs_to_paragraphs(e, inline_tags))

    return new_tree
def test_html_style_tag_css_import(self):
    """An @import inside a <style> body is extracted with attr=None."""
    node = Element('style')
    node.text = '@import url(#);'
    results = list(links(node))
    self.assertEqual(len(results), 1)
    found_el, found_attr, found_url, found_pos = results[0]
    self.assertEqual(found_url, '#')
    self.assertEqual(found_pos, 12)
    self.assertIsNone(found_attr)
    self.assertEqual(found_el.tag, 'style')
    self.assertEqual(found_el.attrib, {})
def test_html_style_tag_css_url_with_altering_colons(self):
    """A url(...) whose opening and closing quotes differ is still
    extracted from a <style> body."""
    node = Element('style')
    node.text = 'html {background: url("#\');}'
    results = list(links(node))
    self.assertEqual(len(results), 1)
    found_el, found_attr, found_url, found_pos = results[0]
    self.assertEqual(found_url, '#')
    self.assertEqual(found_pos, 23)
    self.assertIsNone(found_attr)
    self.assertEqual(found_el.tag, 'style')
    self.assertEqual(found_el.attrib, {})
def test_script_element_with_url_in_the_text(self):
    """A url(...) inside a <script> body is extracted with attr=None."""
    node = Element('script')
    node.text = 'var background = "url(\'image.jpg\')"'
    results = list(links(node))
    self.assertEqual(len(results), 1)
    found_el, found_attr, found_url, found_pos = results[0]
    self.assertEqual(found_url, 'image.jpg')
    self.assertEqual(found_pos, 23)
    self.assertIsNone(found_attr)
    self.assertEqual(found_el.tag, 'script')
    self.assertEqual(found_el.attrib, {})
def add_bootstrap_in_html_header(input_filename, output_filename):
    """Read an HTML file, link the bundled Bootstrap stylesheet into its
    <head>, and write the result to *output_filename*."""
    with open(input_filename) as fp:
        htmlstring = fp.read()
    bootstrap = Element("link")
    bootstrap.attrib.update({
        "rel": "stylesheet",
        "href": "static/css/bootstrap.min.css",
        "type": "text/css",
    })
    document = lhtml.fromstring(htmlstring)
    document.head.append(bootstrap)
    with open(output_filename, "w") as fp:
        fp.write(lhtml.tostring(document, encoding=str))
def inject_script_tag(self, html):
    """Insert the contents of mask_headless.js as an inline <script>
    placed as the first child of the parsed document root."""
    root = lxml.html.fromstring(html)
    if root is None:
        # Sometimes non-html sneaks through the header check
        return html
    with open('mask_headless.js') as f:
        payload = f.read()
    script = Element("script")
    script.text = payload
    root.insert(0, script)
    return lxml.html.tostring(root, method="html").decode('utf-8')
def wrap_set(dom, child_tag, parent_tag):
    """Wrap unbroken sets of elements in a parent container:
    - <li> in a <ul>
    - <tr> in a <table>
    """
    # ``nxt`` holds the sibling expected to begin the next run; while the
    # current element equals that expectation it joins the open container.
    # Starts at 0 (never equal to an element) so the first child always
    # opens a fresh container.
    nxt = 0
    for e in dom.cssselect(child_tag):
        if nxt != e:
            # Start of a new run: create the container and place it in the
            # tree via the module-level ``insert`` helper — presumably at
            # the position ``e`` currently occupies; confirm its semantics.
            box = Element(parent_tag)
            insert(box, e)
        # Move the child into the open container (lxml append relocates).
        box.append(e)
        # Predict the element that would continue this run. NOTE(review):
        # after the append, parent(e) is ``box``, so this reads the
        # sibling following the container; verify that is intended.
        nxt = parent(e).getnext()
        if nxt is None:
            nxt = e.getnext()
def make_html_element(
    tag,
    text='',
    children=None,
    tail=None,
):
    """Make a HtmlElement object.

    Args:
        tag: element tag name.
        text: text content placed inside the element; pass None to leave
            ``.text`` unset.
        children: optional iterable of elements appended as children.
        tail: optional text placed after the element's closing tag.

    Returns:
        The constructed element.
    """
    el = Element(tag)
    if text is not None:
        el.text = text
    if children is not None:
        el.extend(children)
    if tail is not None:
        # BUG FIX: the original assigned ``text`` here, silently dropping
        # the requested tail.
        el.tail = tail
    return el
def test_img_src_set_attribute_bad_formatted(self):
    """A malformed src-set value still yields both candidate URLs."""
    attributes = {'src-set': 'img1 1x; img2 2x,'}
    node = Element('img', attributes)
    results = list(links(node))
    self.assertEqual(len(results), 2)
    # links() yields these candidates with img2 first and img1 last
    # (the original test popped them off the end of the list).
    expected = [('img2', 9), ('img1', 0)]
    for (exp_url, exp_pos), (found_el, found_attr, found_url, found_pos) in zip(expected, results):
        self.assertEqual(found_url, exp_url)
        self.assertEqual(found_pos, exp_pos)
        self.assertEqual(found_attr, 'src-set')
        self.assertEqual(found_el.tag, 'img')
        self.assertEqual(found_el.attrib, attributes)
def fragment_fromstring(html, create_parent=False, guess_charset=None, parser=None):
    """Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node will
    be created to encapsulate the HTML in a single element.  In this case,
    leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError("string required")

    wants_parent = bool(create_parent)
    pieces = fragments_fromstring(
        html,
        guess_charset=guess_charset,
        parser=parser,
        no_leading_text=not wants_parent,
    )

    if wants_parent:
        # A bare truthy value means "wrap in a <div>"; a string names the tag.
        tag = create_parent if isinstance(create_parent, _strings) else "div"
        wrapper = Element(tag)
        if pieces and isinstance(pieces[0], _strings):
            wrapper.text = pieces[0]
            del pieces[0]
        wrapper.extend(pieces)
        return wrapper

    if not pieces:
        raise etree.ParserError("No elements found")
    if len(pieces) > 1:
        raise etree.ParserError("Multiple elements found")
    fragment = pieces[0]
    if fragment.tail and fragment.tail.strip():
        raise etree.ParserError("Element followed by text: %r" % fragment.tail)
    fragment.tail = None
    return fragment
def reconstruct_tables(dom, data):
    """Rebuild <tr>/<td> structure from positioned cell *data* (objects
    with page/x/y coordinates, a clipbox, source element and text), then
    wrap the resulting rows in <table> containers."""
    # order data vertically into row lists by page, row and finally column
    rows = collections.OrderedDict()
    cboxes = {}
    for c in sorted(data, key=lambda c: (c.page, c.y, c.x)):
        # combine page number and row position to get a useful key
        key = '%d,%d' % (c.page, c.y)
        # create row lists(y) and clipbox groups(x)
        rows.setdefault(key, []).append(c)
        cboxes.setdefault(c.clipbox, []).append(c.elem)
    # collect cell lines with same clip boxes; drop cells whose clipbox
    # was already merged into an earlier cell
    merged = []
    for key, row in rows.items():
        for cell in row:
            if cell.clipbox in merged:
                rows[key] = [c for c in rows[key] if c != cell]
            else:
                cell.lines = cboxes[cell.clipbox]
                merged.append(cell.clipbox)
    for row in rows.values():
        # hardly a table row if there is only one non-empty element in it
        # at the start of a line
        if len([c for c in row if c.text]) > 1:
            tr = parent(row[0].elem)
            tr.tag = 'tr'
            for cell in row:
                cell.elem.tag = 'td'
                cell.elem.attrib['class'] = ''
                # fold the remaining lines of this clipbox into the cell
                for line in cell.lines[1:]:
                    line.attrib['class'] = ''
                    if BR:
                        cell.elem.append(Element('br'))
                    cell.elem.append(line)
                tr.append(cell.elem)
    # drop empty span, divs
    for e in dom.iter():
        # BUG FIX: the original condition was
        #   e.tag in (...) and not e.text_content() or e.text_content() == ' '
        # which, due to precedence, dropped ANY element whose text was a
        # single space — not just spans/divs.
        if e.tag in ('span', 'div') and (not e.text_content()
                                         or e.text_content() == ' '):
            e.drop_tag()
    wrap_set(dom, 'tr', 'table')
    return dom
def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating selector."""
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        # BUG FIX: UnicodeDecodeError requires 5 constructor arguments
        # (encoding, object, start, end, reason); the original one-argument
        # call raised TypeError instead of the intended error, and its
        # message contained an unfilled '%s'.
        raise UnicodeDecodeError(
            'unknown', b'', 0, 1, 'Failed to detect encoding')
    root = fromstring(htmlstring, parser=HTMLParser(recover=True,
                                                    encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            # Close the open paragraph at block elements, section anchors,
            # or the last child; otherwise keep absorbing siblings into it.
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') \
                    or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        # Orphan tail text after a block element starts a new paragraph.
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root
def open_tag(self, tag, attrs):
    """
    Handle a new tag. Performs autoclose checks.
    """
    node = Element(tag)
    # Normalize valueless attributes (None) to empty strings.
    node.attrib.update((name, value or "") for name, value in attrs)
    if self.root is None:
        self.root = node
    else:
        if not self.stack:
            self.error("Unexpected open tag: %s" % (tag))
        self.check_autoclose_on_open(tag)
        self.stack[-1].append(node)
    self.stack.append(node)
    if tag == "svg":
        self.in_svg = True
def test_html_style_tag_font_face(self):
    """All four url(...) sources of a @font-face rule are extracted from
    a <style> body, with positions pointing into the text."""
    node = Element('style')
    node.text = (
        "\n        @font-face {"
        "\n        font-family:'fontawesome';"
        "\n        src:url('../lib/fonts/fontawesome.eot?14663396#iefix') format('embedded-opentype'),"
        "\n        url('../lib/fonts/fontawesome.woff?14663396') format('woff'),"
        "\n        url('../lib/fonts/fontawesome.ttf?14663396') format('truetype'),"
        "\n        url('../lib/fonts/fontawesome.svg?14663396#fontawesome') format('svg');"
        "\n        font-style:normal;"
        "\n        }"
        "\n        "
    )
    results = list(links(node))
    self.assertEqual(len(results), 4)
    # links() yields these candidates in descending position order
    # (the original test popped them off the end of the list).
    expected = [
        ('../lib/fonts/fontawesome.svg?14663396#fontawesome', 305),
        ('../lib/fonts/fontawesome.ttf?14663396', 232),
        ('../lib/fonts/fontawesome.woff?14663396', 162),
        ('../lib/fonts/fontawesome.eot?14663396#iefix', 74),
    ]
    for (exp_url, exp_pos), (found_el, found_attr, found_url, found_pos) in zip(expected, results):
        self.assertEqual(found_url, exp_url)
        self.assertEqual(found_pos, exp_pos)
        self.assertIsNone(found_attr)
        self.assertEqual(found_el.tag, 'style')
        self.assertEqual(found_el.attrib, {})
def parse_item(self, response):
    """Scrapy callback: build an item from an article page.

    Collects captioned and plain images (url -> (caption, alt)), gallery
    slide images, author/date/title metadata, and a cleaned copy of the
    article HTML. Python 2 code (urlparse, xrange).
    """
    item = self.get_new_item(response)
    html = utils.decode(response.body, response.encoding)
    article = response.xpath("//div[contains(@class,'item-page')]")
    # url -> (caption-or-None, alt-or-None) for every image found
    caption_images = {}
    caption_imgs = article.xpath(".//p/span[contains(@class,'wf_caption')]")
    imgs = article.xpath(".//p/img")
    # Images wrapped in a wf_caption span: keep the caption markup too.
    for caption_img in caption_imgs:
        # Prefer alt, fall back to title, else no alt text.
        if caption_img.xpath(".//img/@alt"):
            image_alt = caption_img.xpath(".//img/@alt").extract()[0].strip()
        elif caption_img.xpath(".//img/@title"):
            image_alt = caption_img.xpath(".//img/@title").extract()[0].strip()
        else:
            image_alt = None
        caption_images[urlparse.urljoin(response.url, caption_img.xpath(".//img/@src").extract()[0].strip())] = ("".join(caption_img.xpath(".//span").extract()), image_alt)
    # Bare <p><img> images have no caption.
    for img in imgs:
        if img.xpath(".//@alt"):
            image_alt = img.xpath(".//@alt").extract()[0].strip()
        elif img.xpath(".//@title"):
            image_alt = img.xpath(".//@title").extract()[0].strip()
        else:
            image_alt = None
        caption_images[urlparse.urljoin(response.url, img.xpath(".//@src").extract()[0])] = (None, image_alt)
    # Gallery wrappers: pair slide descriptions with thumbnail images.
    slid_imgs = []
    for main_images_wrapper in response.xpath("//div[contains(@id,'main_images_wrapper')]"):
        desc = []
        image_info = []
        for main_des_container in main_images_wrapper.xpath(".//div[contains(@id,'main_des_container')]/div[contains(@class,'des_div')]/p"):
            desc.append(main_des_container.xpath(".//text()").extract()[0].strip())
        for img_tag in main_images_wrapper.xpath(".//div[contains(@id,'main_thumbs_arrow_wrapper')]/div[contains(@id,'main_thumb_container')]//img[contains(@class,'ig_thumb')]"):
            if img_tag.xpath(".//@src"):
                # Rewrite the thumbnail size segment to the full-size variant.
                path = urlparse.urljoin(response.url, img_tag.xpath(".//@src").extract()[0].strip().replace("120-90-80-c", "600-450-80"))
                slid_imgs.append(path)
            else:
                path = ""
            alt = img_tag.xpath(".//@alt").extract()[0].strip() if img_tag.xpath(".//@alt") else None
            image_info.append((path, alt))
        # Zip descriptions onto slides by index (assumes equal ordering).
        for i in xrange(len(desc)):
            caption_images[image_info[i][0]] = (desc[i], image_info[i][1])
    item['image_urls'] = [u for u in caption_images.keys()]
    item["json"]["caption_images"] = caption_images
    # "createdby" looks like "by AUTHOR | DATE"; [3:] strips the "by " prefix.
    createdby = response.xpath("//dd[@class='createdby']//text()").extract()[0]
    createdBySplit = createdby.split('|')
    if len(createdBySplit) >= 2:
        item['json']['author'] = createdBySplit[0].strip()[3:]
        item['json']['date'] = createdBySplit[1].strip()
    else:
        item['json']['date'] = createdby.strip()
    item['json']['title'] = article.xpath(".//h2/text()").extract()[0].strip()
    item['json']['item_url'] = response.url
    # Re-parse the article so we can strip chrome before storing content.
    content_document = fromstring(article.extract()[0].strip())
    del_title = content_document.xpath(".//h2")[0]
    del_title.getparent().remove(del_title)
    del_author_date = content_document.xpath(".//dl[contains(@class,'article-info')]")[0]
    del_author_date.getparent().remove(del_author_date)
    if content_document.xpath(".//div[contains(@id,'main_images_wrapper')]"):
        # Replace the gallery widget with plain <img> tags for each slide.
        del_main_images_wrapper = content_document.xpath(".//div[contains(@id,'main_images_wrapper')]")[0]
        for image_url in slid_imgs:
            img_doc = Element("img", **{"src": image_url})
            del_main_images_wrapper.addprevious(img_doc)
        del_main_images_wrapper.getparent().remove(del_main_images_wrapper)
        # NOTE(review): assumed the igallery_clear removal belongs inside
        # this gallery branch — confirm against the original layout.
        del_igallery_clear_div = content_document.xpath(".//div[contains(@class,'igallery_clear')]")[0]
        del_igallery_clear_div.getparent().remove(del_igallery_clear_div)
    # Unwrap captioned images: keep the <img>, drop the caption span.
    captions = content_document.xpath(".//p/span[contains(@class,'wf_caption')]")
    if captions:
        for caption in captions:
            keep_img = caption.xpath(".//img")[0]
            caption.addnext(keep_img)
            caption.getparent().remove(caption)
    item["json"]["content"] = tostring(content_document, encoding="UTF-8")
    sourceurl = response.meta['source_url']
    item['json']['category'] = response.meta['category']
    item['html'] = html
    htmls_path = {sourceurl: html}
    item["htmls_path"] = htmls_path
    item['source_url'] = sourceurl
    return item
def semanticize(doc_path='test.html'):
    """
    P: unbroken set of lines (.t divs) of the same look make one <p>
    H1-3: Top 3 kinds of font size are turned to h1, h2 and h3.
    TABLE: use x and y position to indicate <td>, TODO: colspan support
    """
    print(doc_path)
    dom, dimensions = prepare(doc_path)
    # Look up a line's dimension (e.g. height) from its CSS class; 0 if absent.
    get_dimension = lambda el, dim_type: dimensions[dim_type].get(
        classN(dim_type, el)) or 0
    # recover text from embedded fonts with bad CMAPS if > 50% of characters are unicode PUA
    recover = pua_content(dom.text_content()) > 0.5
    if recover:
        print('Recovery needed, not now.')
        return recover_text(dom, os.path.dirname(doc_path))
    # remove paging headers
    if REMOVE_HEADERS:
        dom = remove_headers(dom)
    # remove javascript holders
    for div in dom.cssselect('.j'):
        remove(div)
    if TABLES:
        table_data = grid_data(dom, get_dimension)
        dom = reconstruct_tables(dom, table_data)
    h_levels = heading_levels(dom, dimensions)
    # line by line analysis and conversion
    # ``p_*`` are the previous line's values; ``box`` is the element the
    # current block is being merged into (int 0 until the first block opens).
    p_look = p_height = p_space = p_tag = box = 0
    for l in dom.cssselect('.t'):
        # Gather information about this line to see if it's part of a block.
        # 1. detect change of look - different css classes from previous line
        look = ' '.join([
            c for c in l.attrib['class'].split()
            if c[0] != 'y' and c[0:2] != 'fc'
        ])  # ignore y pos and font color
        new_look = p_look != look
        # 2. detect change of margin height - larger difference in bottom position from previous line
        height = get_dimension(l, 'h')
        line_height = p_height - height
        margin = line_height > MAX_LINE_HEIGHT
        # 3. space above - preceding empty line
        space = not l.text_content().strip()
        # Based on collected info: does this line belong to previous line?
        # (chained comparison: True only when all three flags are False)
        append = new_look == p_space == margin == False
        txt = l.text_content()
        tag = 'p'
        # LI
        indent = 'x0' not in look  # there is some indentation
        if [1 for b in BULLETS if txt.startswith(b)]:
            tag = 'li'
            append = 0
        elif indent and p_tag == 'li':
            tag = 'li'
            append = 1
        # H1, H2...
        size = classN('fs', l)
        if size in h_levels.keys():
            append = 0
            tag = 'h%s' % h_levels[size]
        # merge multiline-elements
        if txt.strip():
            if append:
                if BR:
                    box.append(Element('br'))
                box.append(l)
            else:
                # NOTE(review): reconstructed with ``l.tag = tag`` inside
                # this branch (a new block opens and is retagged); confirm
                # against the original indentation.
                box = l
                l.tag = tag
        else:
            remove(l)
        if DEBUG:
            mark = ('<%s>' % tag).ljust(5)
            if append:
                mark = 5 * ' '
            print(' Aa %d ⇪ %d ⇕ % 3d %s %s %s' %\
                (new_look, p_space, line_height,
                 l.attrib['class'].ljust(40), mark, txt))
        # save current values for comparison in the next loop iteration
        p_space, p_height, p_look, p_tag = space, height, look, tag
    wrap_set(dom, 'li', 'ul')
    if STRIP_CSS:
        for e in dom.cssselect("style"):
            remove(e)
        for attr in 'style id class data-page-no data-data'.split():
            for e in dom.cssselect("*"):
                try:
                    del e.attrib[attr]
                except KeyError:
                    pass
    # save file
    html = tostring(dom, encoding=ENCODING,
                    pretty_print=True).decode(ENCODING)
    s = '<!DOCTYPE html>' + html
    for a, b in REPLACE_AFTER:
        s = re.sub(a, b, s)
    for rm in REMOVE_AFTER:
        s = re.sub(rm, '', s)
    for b in BULLETS:
        s = s.replace(b, '')
    # NOTE(review): ``recover`` can only be True on the early-return path
    # above, so this branch appears unreachable — dead code?
    if recover:
        for rm in REMOVE_BEFORE:
            s = re.sub(rm, '', s)
    save_path = os.path.dirname(doc_path.replace('HTML', 'HTM')) + '.htm'
    f = open(save_path, 'w', encoding=ENCODING)
    f.write(s)
    f.close()