def write_miss_to_html(self, name: str, url: str, msg: str, html_doc: html.Element):
    s = html.Element("div")
    h = html.Element("h1")
    h.text = name
    s.append(h)
    m = html.Element("div")
    m.text = self.cache.read_date_time_str(name + ".html")
    s.append(m)
    m = html.Element("span")
    m.text = msg
    s.append(m)
    x = html.Element("br")
    s.append(x)
    a = html.Element("a")
    a.attrib["href"] = url
    a.text = url
    s.append(a)
    html_doc.append(s)
    html_doc.append(html.Element("hr"))
def parse_game_metadata(self, doc: Element) -> Dict[str, str]:
    # Parse data by id
    date = doc.xpath('//span[@id="fechaLabel"]')
    hour = doc.xpath('//span[@id="horaLabel"]')
    league = doc.xpath('//span[@id="paginaTitulo_ligaLabel"]')
    season = doc.xpath('//span[@id="paginaTitulo_temporadaLabel"]')
    home_team = doc.xpath('//a[@id="equipoLocalHyperLink"]')
    home_score = doc.xpath('//span[@id="resultadoLocalLabel"]')
    away_team = doc.xpath('//a[@id="equipoVisitanteHyperLink"]')
    away_score = doc.xpath('//span[@id="resultadoVisitanteLabel"]')
    main_referee = doc.xpath('//span[@id="arbitroPrincipalLabel"]')
    second_referee = doc.xpath('//span[@id="arbitroAuxiliarLabel"]')

    metadata_dict = {
        "date": self.parse_str(date[0].text_content()),
        "hour": self.parse_str(hour[0].text_content()),
        "league": self.parse_str(league[0].text_content()),
        "season": self.parse_str(season[0].text_content()),
        "home_team": self.parse_str(home_team[0].text_content()),
        "home_score": self.parse_str(home_score[0].text_content()),
        "away_team": self.parse_str(away_team[0].text_content()),
        "away_score": self.parse_str(away_score[0].text_content()),
        "main_referee": self.parse_str(main_referee[0].text_content()),
        "second_referee": self.parse_str(second_referee[0].text_content()),
    }
    return metadata_dict
def write_as_html(self, foutput, name: str, url: str, tables: List[ContentTable], html_doc: html.Element):
    s = html.Element("div")
    h = html.Element("h1")
    h.text = name
    s.append(h)
    m = html.Element("div")
    m.text = self.cache.read_date_time_str(name + ".html")
    s.append(m)
    for t in tables:
        s.append(t.new_element)
    x = html.Element("br")
    s.append(x)
    a = html.Element("a")
    a.attrib["href"] = url
    a.text = url
    s.append(a)

    h = html.Element("html")
    h.append(html.Element("body"))
    h[0].append(deepcopy(s))
    foutput.write(html.tostring(h, pretty_print=True))

    html_doc.append(s)
    html_doc.append(html.Element("hr"))
def _apply_font_icons(html):
    root = fromstring(wrap_unwrap_fake_tag(html))
    for element in root.iter('a'):
        resource = element.attrib.get('href')
        if not (resource and element.text):  # .text empty in anchors <a>
            continue
        # External link
        if resource.startswith('https://github.com'):
            icon_class = HTMLGen.EXTERNAL_LINK_GITHUB_ICON_CLASS
        elif resource.startswith('http'):
            icon_class = HTMLGen.EXTERNAL_LINK_ICON_CLASS
        # Anchor
        elif resource.startswith('#'):
            icon_class = HTMLGen.ANCHOR_LINK_ICON_CLASS
        # File
        elif any(map(resource.endswith, HTMLGen.EXTENSIONS_ICON_CLASSES_MAP.keys())):
            extension = resource.rsplit('.', 1)[-1]
            icon_class = HTMLGen.EXTENSIONS_ICON_CLASSES_MAP[extension]
        else:
            print('Unknown icon resource ', resource)
            continue
        # Element prototype
        span_element = Element('span', attrib={'class': 'iconify', 'data-icon': icon_class})
        span_element.tail = ' ' + element.text
        element.text = None
        element.insert(0, span_element)
    html = tostring(root)
    html = wrap_unwrap_fake_tag(html, wrap=False)
    return html
def parse_li(li: Element, path: str, index: int):
    """Recursively parse an <li> node: leaf pages are written out as markdown
    files, and each nested list gets its own directory."""
    # zero-padded two-digit prefix to keep files in order
    prefix = "000" + str(index)
    prefix = prefix[len(prefix) - 2:]
    if check_li(li):
        tem_dir = li.xpath("./a/text()")
        tem_a_url = li.xpath("./a/@href")
        if tem_dir:
            a_href = urljoin(BASE_URL, tem_a_url[0])
            # we now have the innermost URL and can parse it accordingly
            res_ = requests.get(a_href)
            content_ = res_.text.encode("ISO-8859-1").decode('utf-8')
            file_name = path + "/{}-".format(prefix) + tem_dir[0].strip() + ".md"
            run(content_, '//*[@id="book-search-results"]/div[1]/section/*',
                file_name, base_url="http://www.topgoer.com")
            # print(file_name, a_href)
        return
    else:
        a_title = li.xpath("./a/text()")
        full_path = ""
        if a_title:
            # create the directory for this subtree
            sub_path = a_title[0].strip()
            full_path = path + "/{}-".format(prefix) + sub_path
            if not os.path.exists(full_path):
                os.makedirs(full_path)
        li_list_ = li.xpath("./ul/li")
        for i_ in range(len(li_list_)):
            parse_li(li_list_[i_], full_path, i_)
def process_img(self, doc, el):
    """
    Process <img> tag in the source document.
    """
    self.add_alt_tags(el)

    # Skip over images with the nomobileresize attribute
    if el.attrib.pop("nomobileresize", "") != "":
        return

    src = el.attrib.get("src", None)
    if src:
        originalSrc = src
        site = getSite()
        # catch exceptions to ensure broken images don't
        # prevent the page from rendering
        try:
            src = self.rewrite(src)
            shorturl = getUtility(IMobileImageShortURLStorage)
            key = shorturl.getkey(src)
            if key is None:
                key = shorturl.suggest()
                # just check that suggest() is working as expected
                assert shorturl.get(key) is None
                shorturl.add(key, src)
            src = '%s/@@shortimageurl/%s' % (site.absolute_url(), key)
            el.attrib["src"] = src
        except Exception:
            # blank alt text
            del el.attrib["alt"]
            el.attrib["src"] = src
            error = ['src: %s' % src,
                     'URL: %s' % site.REQUEST.URL,
                     'Referer: %s' % site.REQUEST.HTTP_REFERER,
                     'User Agent: %s' % site.REQUEST.get('HTTP_USER_AGENT', 'Unknown'),
                     traceback.format_exc()]
            # Stop logging image processing errors, it creates
            # unnecessary noise in the error log
            # error = '\n'.join(error)
            # LOG.info(error)

        # Make image clickable and point to original src
        a = Element('a')
        a.attrib['href'] = originalSrc
        el.getparent().replace(el, a)
        a.append(el)

    # Remove explicit width declarations
    if "width" in el.attrib:
        del el.attrib["width"]
    if "height" in el.attrib:
        del el.attrib["height"]

    if self.needs_clearing(el):
        self.clear_floats(el)

    self.add_processed_class(el)
def _dumps_xml_from_pml_nodes(root_node):
    node_name, attributes, sub_nodes = root_node
    element = Element(node_name, **attributes)
    for sub_node in sub_nodes:
        element.append(_dumps_xml_from_pml_nodes(sub_node))
    return element
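# A minimal, hypothetical usage sketch for _dumps_xml_from_pml_nodes. It
# assumes each PML node is a (tag, attributes, children) tuple, as the
# unpacking above implies, and that Element/tostring come from lxml.etree.
from lxml.etree import Element, tostring

pml = ("ul", {"class": "menu"}, [
    ("li", {}, []),
    ("li", {"id": "last"}, []),
])
tree = _dumps_xml_from_pml_nodes(pml)
print(tostring(tree))  # b'<ul class="menu"><li/><li id="last"/></ul>'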
def _indent_elem(self, elem: html.Element, depth: int):
    if len(elem) > 0:
        elem.text = self._indent_text(elem.text, depth + 1)
        for ch in elem:
            self._indent_elem(ch, depth + 1)
        elem[-1].tail = self._indent_text(elem.tail, depth)
        elem.tail = self._indent_text(elem.tail, depth)
    else:
        elem.text = self._indent_text(elem.text, 0)
        elem.tail = self._indent_text(elem.tail, depth)
def get_user(username, rank):
    if rank is None:
        element = Element('span')
    else:
        element = Element('a', {'class': rank,
                                'href': reverse('user_page', args=[username])})
    element.text = username
    return element
def brs_to_paragraphs(tree, inline_tags=None):
    """
    Return an lxml tree with all <br> elements stripped and paragraphs put in
    place where necessary.
    """
    # add these tags to p's that we're currently building, any other tags will
    # close the current p
    inline_tags = inline_tags or ['a']

    # if this tree doesn't have any child elements, just return it as is
    if len(tree) == 0:
        return tree

    # if this tree doesn't contain any <br> tags, we don't need to touch it
    if tree.find('.//br') is None:
        return tree

    # XXX: We're building a whole new tree here and leaving out any attributes.
    # A) That might be a little slower and more memory intensive than modifying
    # the tree in place, and B) we're dropping any attributes on block elements.
    # The latter is probably fine for current use, but certainly not ideal.
    new_tree = Element(tree.tag)
    p = None

    # if this tree starts out with text, create a new paragraph for it, and
    # add it to the tree
    if tree.text:
        p = E.P()
        p.text = tree.text
        new_tree.append(p)

    for e in tree:
        if e.tag == 'br':
            # avoid adding empty p elements
            if e.tail is None:
                continue
            # start a new p
            p = E.P()
            p.text = e.tail
            new_tree.append(p)
        # if this is a block tag, and it has trailing text, that text needs to
        # go into a new paragraph... only if the tail has actual content and
        # not just whitespace though.
        elif e.tail and re.search(r'\S', e.tail) and e.tag not in inline_tags:
            p = E.P()
            p.text = e.tail
            e.tail = ''
            new_tree.append(e)
            new_tree.append(p)
        # keep inline tags inside the current paragraph
        elif e.tag in inline_tags:
            # start a paragraph if the tree began with an inline element
            if p is None:
                p = E.P()
                new_tree.append(p)
            p.append(e)
        else:
            new_tree.append(brs_to_paragraphs(e))

    return new_tree
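# A hedged usage sketch for brs_to_paragraphs. It assumes the same module
# context the function relies on (re, an E element builder such as
# lxml.html.builder, and Element); fromstring/tostring are from lxml.html.
from lxml.html import fromstring, tostring

div = fromstring('<div>intro<br>first line<br>second <a href="#">link</a></div>')
print(tostring(brs_to_paragraphs(div)))
# b'<div><p>intro</p><p>first line</p><p>second <a href="#">link</a></p></div>'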
def add_bootstrap_in_html_header(input_filename, output_filename):
    with open(input_filename) as fp:
        htmlstring = fp.read()
    bootstrap = Element("link")
    bootstrap.attrib["rel"] = "stylesheet"
    bootstrap.attrib["href"] = "static/css/bootstrap.min.css"
    bootstrap.attrib["type"] = "text/css"
    html = lhtml.fromstring(htmlstring)
    html.head.append(bootstrap)
    with open(output_filename, "w") as fp:
        fp.write(lhtml.tostring(html, encoding=str))
def parse_news_block(element: Element, basic_url: str) -> Union[Dict, None]:
    payload = dict()
    try:
        payload["news_title"] = element.xpath(
            ".//span[@class='newslist__text-title']/text()")[0]
        payload["news_url"] = "".join([basic_url, element.xpath("./@href")[0]])
        payload["image_url"] = "".join(
            [basic_url, element.xpath(".//img/@src")[0]])
    except IndexError:
        return None
    return payload
def test_html_style_tag_css_import(self):
    source = Element('style')
    source.text = '@import url(#);'
    elements = list(links(source))
    self.assertEqual(len(elements), 1)
    el, attr, url, pos = elements.pop()
    self.assertEqual(url, '#')
    self.assertEqual(pos, 12)
    self.assertEqual(attr, None)
    self.assertEqual(el.tag, 'style')
    self.assertEqual(el.attrib, {})
def get_user(username, data):
    if not data:
        element = Element('span')
        element.text = username
        return element

    element = Element('span', {'class': Profile.get_user_css_class(*data)})
    link = Element('a', {'href': reverse('user_page', args=[username])})
    link.text = username
    element.append(link)
    return element
def indent_data_table(self, t: html.Element) -> html.Element:
    prefix = "\n "
    xprefix = prefix + " "
    t.text = xprefix
    t.tail = prefix
    for ch in t:
        if len(ch) > 0:
            self.indent_element(ch, 0, xprefix)
        ch.tail = xprefix
    # guard against an empty table, and return t to match the annotation
    if len(t) > 0:
        t[-1].tail = prefix
    return t
def test_script_element_with_url_in_the_text(self):
    source = Element('script')
    source.text = 'var background = "url(\'image.jpg\')"'
    elements = list(links(source))
    self.assertEqual(len(elements), 1)
    el, attr, url, pos = elements.pop()
    self.assertEqual(url, 'image.jpg')
    self.assertEqual(pos, 23)
    self.assertEqual(attr, None)
    self.assertEqual(el.tag, 'script')
    self.assertEqual(el.attrib, {})
def test_html_style_tag_css_url_with_altering_colons(self):
    source = Element('style')
    source.text = 'html {background: url("#\');}'
    elements = list(links(source))
    self.assertEqual(len(elements), 1)
    el, attr, url, pos = elements.pop()
    self.assertEqual(url, '#')
    self.assertEqual(pos, 23)
    self.assertEqual(attr, None)
    self.assertEqual(el.tag, 'style')
    self.assertEqual(el.attrib, {})
def _inject_extra_elements(self, tree: html.Element, xurl: str):
    if xurl is None:
        return
    if len(tree) == 0 or tree[0].tag != "head":
        return
    head = tree[0]
    # bail out if a <base> is already present in the head
    if len(head.findall("base")) > 0:
        return
    base = html.Element("base")
    base.attrib["href"] = xurl
    head.insert(0, base)
def clean_element(self, elem: html.Element):
    tag = elem.tag
    if tag in ["script", "noscript", "style", "meta", "input", "iframe",
               "select", "link", "font"]:
        elem.getparent().remove(elem)
        return
    if tag == etree.Comment:
        elem.getparent().remove(elem)
        return
    if tag == etree.ProcessingInstruction:
        elem.getparent().remove(elem)
        return

    if tag == "form":
        a = elem.attrib.get("action")
        if a is not None:
            del elem.attrib["action"]
        x = elem.attrib.get("onsubmit")
        if x is not None:
            del elem.attrib["onsubmit"]

    if tag == "a":
        href = elem.attrib.get("href")
        if href is not None and (href.startswith("https://twitter.com") or
                                 href.startswith("http://twitter.com") or
                                 href.startswith("https://t.co")):
            self.remove_twitter_cluster(elem.getparent())

    if tag == "svg":
        while len(elem):
            del elem[0]
    else:
        # iterate over a copy: children may be removed during cleaning
        for ch in list(elem):
            self.clean_element(ch)

    if tag in ["div", "span"]:
        if self.is_empty(elem):
            elem.getparent().remove(elem)
            return
        if self.mark_special_case(elem):
            return
    elif tag in ["a"]:
        if self.mark_special_case(elem):
            return
        # strip spaces from simple links
        if len(elem) > 0:
            elem[-1].tail = None
        elif elem.text is not None:
            elem.text = elem.text.strip()

    self.clean_attributes(elem)
def inject_script_tag(self, html):
    root = lxml.html.fromstring(html)
    if root is None:
        # Sometimes non-html sneaks through the header check
        return html
    with open('mask_headless.js') as f:
        content_js = f.read()
    script = Element("script")
    script.text = content_js
    root.insert(0, script)
    html = lxml.html.tostring(root, method="html").decode('utf-8')
    return html
def wrap_set(dom, child_tag, parent_tag):
    """Wrap unbroken sets of elements in a parent container:
    - <li> in a <ul>
    - <tr> in a <table>
    """
    nxt = 0
    for e in dom.cssselect(child_tag):
        if nxt != e:
            # element does not continue the current run: open a new container
            box = Element(parent_tag)
            insert(box, e)
        box.append(e)
        # the element that would continue this run is whatever follows the box
        nxt = parent(e).getnext()
        if nxt is None:
            nxt = e.getnext()
def _apply_headers_anchors(html: str) -> str:
    root_element = fromstring(wrap_unwrap_fake_tag(html))
    for element in root_element:
        if element.tag in HEADERS:
            id_ = make_header_id(element.text)
            a_element = Element('a', {'id': id_, 'href': f'#{id_}'})
            span_element = Element('span', attrib={'class': 'iconify',
                                                   'data-icon': HTMLGen.ANCHOR_LINK_ICON_CLASS})
            a_element.append(span_element)
            element.text += ' '
            element.insert(0, a_element)
    html = tostring(root_element)
    html = wrap_unwrap_fake_tag(html, wrap=False)
    return html
def _add_html_info_row(self, t: html.Element, label: str, val: str, cls: str = None):
    tr = html.Element("tr")
    td = html.Element("td")
    td.text = label
    if cls is not None:
        td.attrib["class"] = cls
    tr.append(td)
    td = html.Element("td")
    td.text = val
    if cls is not None:
        td.attrib["class"] = cls
    tr.append(td)
    tr.tail = "\n "
    t.append(tr)
def del_with_table(sub: Element, file: TextIO):
    # header row; use relative xpaths so we only search this subtree,
    # not the whole document
    col_name = sub.xpath(".//thead/tr/th/text()")
    col_num = len(col_name)
    row_head = "| " + " | ".join(col_name) + " |\n"
    file.write(row_head)
    # separator row
    tem_list = "| " + " | ".join([":-----:" for i in range(col_num)]) + " |\n"
    file.write(tem_list)
    # handle the table body
    trs = sub.xpath(".//tbody/tr")
    for tr in trs:
        row = tr.xpath("./td/text()")
        row_each = "| " + " | ".join(row).replace("\n", "") + " |\n"
        file.write(row_each)
def mark_special_case(self, elem: html.Element) -> bool:
    """ Edit the element in place, or return the element to remove. """

    # -- stupid special cases for CA
    #if elem.tag == "div":
    #    xid = elem.get("id")
    #    if xid == "DeltaPlaceHolderPageDescription" or xid == "DeltaPlaceHolderPageTitleInTitleArea":
    #        logger.debug("special case: remove deltaplaceholder")
    #        self.to_remove.append(elem.getparent())
    #        return True
    #elif elem.tag == "a":
    #    href = elem.get("href")
    #    if href == "#ctl00_ctl65_SkipLink":
    #        logger.debug("special case: remove skiplink")
    #        self.to_remove.append(elem.getparent())
    #        return True

    if elem.tag == "div":
        xid = elem.attrib.get("id")
        if xid == "google_translate_element" and len(elem) > 0:
            logger.debug("special case: google_translate_element")
            return elem[0]
        if xid is not None:
            xid2 = re.sub("^[0-9a-fA-F]+-(.*)", "\\1", xid)
            xid2 = re.sub("(.*)-[a-z]?[0-9a-fA-F]+$", "\\1", xid2)
            if xid != xid2:
                logger.debug("special case: hex data in id")
                elem.attrib["id"] = xid2

    if elem.attrib.get("fb-xfbml-state"):
        logger.debug("special case: fb")
        return elem

    return False
def del_with_ul(tag: Element, file: TextIO, space_num=0, prefix=None):
    """
    :param tag: the element containing the <ul>
    :param file: output file
    :param space_num: number of leading spaces, controls nesting depth
    :param prefix: optional line prefix
    """
    tag = etree.HTML(
        etree.tostring(tag, encoding="utf-8", pretty_print=True,
                       method="html").decode())
    li = tag.xpath("/html/body/ul/li")
    # no <li> tags, nothing to write
    if not li:
        return
    for k in li:
        tem_ = k.xpath("./text()")
        if tem_:
            if not prefix:
                line = " " * space_num + "* " + tem_[0].replace("\n", "") + "\n"
            else:
                line = prefix + " " + " " * space_num + "* " + tem_[0].replace("\n", "") + "\n"
            file.write(line)
        tem = k.xpath("./ul")
        if tem is not None and len(tem) > 0:
            del_with_ul(tem[0], space_num=space_num + 1, file=file)
def cssselect(node: Element, selector: str) -> Element:
    result = node.cssselect(selector)
    if len(result) != 1:
        raise Exception(
            f"Selector {selector} on node {inner_html(node)} gave {len(result)} results, needed one"
        )
    return result[0]
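# A hedged usage sketch: this cssselect wrapper asserts that a selector
# matches exactly one node. fromstring is lxml.html's parser; the selectors
# here are illustrative.
from lxml.html import fromstring

doc = fromstring('<div><p class="lead">hello</p><p>world</p></div>')
lead = cssselect(doc, 'p.lead')
print(lead.text)         # hello
# cssselect(doc, 'p')    # would raise: two <p> elements match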
def fix_urls(el: Element, base_url: str, broken_urls: List[str],
             urls_to_change: dict) -> Tuple[Element, List[str]]:
    """
    Given an HTML element, turns all ``href`` parameters of ``a`` elements
    inside it into fully-qualified absolute URLs instead of the relative
    paths that are common in the tips content.

    :arg Element el: ``lxml.html.Element`` object, the content to change.
    :arg str base_url: The URL for the page, which serves as the absolute
        point with which to calculate the absolute paths.
    :arg list broken_urls: The list of broken URLs to add to as we find them.
    :arg dict[str, str] urls_to_change: Known broken URLs and their
        replacements.

    :rtype: tuple[Element, list]
    :returns: The Element with its ``a`` elements altered, and the list of
        broken URLs.
    """
    tested_urls = []  # type: List[str]
    for desc in el.iterdescendants():
        if desc.tag == "a" and "href" in desc.attrib:
            fixed_url, tested_urls, broken_urls = fix_url(
                base_url, desc.attrib["href"], tested_urls, broken_urls,
                urls_to_change)
            desc.attrib["href"] = fixed_url
    return (el, broken_urls)
def convert_ga(self, doc: html.Element) -> Dict:
    t = doc.findall(".//table")
    if len(t) == 0:
        return {"error": "no tables -> page layout changed",
                "at": udatetime.now_as_utc()}

    data = self._htmltable_to_dict(t[0])
    if len(data["data"]) != 2:
        return {"error": "expected two data rows", "at": udatetime.now_as_utc()}
    if data["data"][0]["COVID-19 Confirmed Cases"] != "Total":
        return {"error": "first row should be totals", "at": udatetime.now_as_utc()}
    if data["data"][1]["COVID-19 Confirmed Cases"] != "Deaths":
        return {"error": "second row should be deaths", "at": udatetime.now_as_utc()}

    positive = data["data"][0]["No. Cases (%)"]
    positive = int(positive[0: positive.index("(")])
    deaths = data["data"][1]["No. Cases (%)"]
    deaths = int(deaths[0: deaths.index("(")])

    # guard the second table before indexing it
    if len(t) < 2:
        return {"error": "expected a second table -> page layout changed",
                "at": udatetime.now_as_utc()}
    data = self._htmltable_to_dict(t[1])
    if len(data["data"]) != 2:
        return {"error": "expected two data rows", "at": udatetime.now_as_utc()}
    if data["data"][0]["Lab"] != "Commercial Lab":
        return {"error": "first row should be Commercial Lab", "at": udatetime.now_as_utc()}
    if data["data"][1]["Lab"] != "GPHL":
        return {"error": "second row should be GPHL", "at": udatetime.now_as_utc()}

    lab_1 = int(data["data"][0]["Total Tests"])
    lab_2 = int(data["data"][1]["Total Tests"])
    tests = lab_1 + lab_2

    return {"positive": positive, "tests": tests, "deaths": deaths}
def fragment_fromstring(html, create_parent=False, guess_charset=None, parser=None):
    """Parses a single HTML element; it is an error if there is more than one
    element, or if anything but whitespace precedes or follows the element.

    If 'create_parent' is true (or is a tag name) then a parent node will be
    created to encapsulate the HTML in a single element. In this case,
    leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError("string required")

    accept_leading_text = bool(create_parent)

    elements = fragments_fromstring(
        html,
        guess_charset=guess_charset,
        parser=parser,
        no_leading_text=not accept_leading_text,
    )

    if create_parent:
        if not isinstance(create_parent, _strings):
            create_parent = "div"
        new_root = Element(create_parent)
        if elements:
            if isinstance(elements[0], _strings):
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError("No elements found")
    if len(elements) > 1:
        raise etree.ParserError("Multiple elements found")
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError("Element followed by text: %r" % result.tail)
    result.tail = None
    return result
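# A hedged usage sketch for fragment_fromstring; the guess_charset parameter
# suggests this is the html5lib-backed variant (lxml.html.html5parser), so
# html5lib would need to be installed.
frag = fragment_fromstring('<p>one element only</p>')
print(frag.tag)  # p

# leading text is only legal when a parent is created to hold it
wrapped = fragment_fromstring('intro <b>bold</b>', create_parent=True)
print(wrapped.tag, wrapped.text)  # div intro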
def innerhtml(el: Element, encoding: str = "utf-8") -> str:
    """
    Returns the HTML of an element as a ``str``, with the opening and closing
    tags removed.

    :arg Element el: ``lxml.html.Element`` object.
    :arg str encoding: The character encoding for the HTML.

    :rtype: str
    :returns: A string of HTML without the opening and closing tags.
    """
    children = list(el.iterchildren())
    if not len(children):
        return el.text_content()
    text = "%s" % el.text if el.text else ""
    return "%s%s" % (text, "".join(
        [tostring(c).decode(encoding) for c in el.iterchildren()]))
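# A minimal usage sketch for innerhtml, assuming lxml.html's fromstring.
# Note that tostring() serializes each child together with its tail text,
# which is why the trailing "!" survives.
from lxml.html import fromstring

el = fromstring('<div>Hello <b>world</b>!</div>')
print(innerhtml(el))  # Hello <b>world</b>!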
def main():
    absolute_folder = sys.argv[1]
    pattern = '*.iml'
    fileList = []

    # Walk through directory
    for dName, sdName, fList in os.walk(absolute_folder):
        for fileName in fList:
            if fnmatch.fnmatch(fileName, pattern):  # Match search string
                fileList.append(os.path.join(dName, fileName))

    pbar = ProgressBar(widgets=['Processing :', Percentage(), ' ', Bar(), ' ', ETA()],
                       maxval=len(fileList)).start()
    fcount = 0
    for fileName in fileList:
        output_dict = generate_empty_dict()
        eclipse_file_path = os.path.dirname(fileName) + '/.classpath'
        with open(fileName, 'r') as f:
            intellij_data = f.read()
        fcount = fcount + 1
        # skip empty .iml files instead of feeding them to xmltodict
        if not intellij_data:
            continue
        intellij_dict = xmltodict.parse(intellij_data)
        # print(intellij_dict)
        output_dict = addSrcType(intellij_dict, output_dict)
        output_dict = addCombinedRules(intellij_dict, output_dict)
        output_dict = addConType(intellij_dict, output_dict)
        # print json.dumps(intellij_dict)
        result = bf.etree(output_dict, root=Element('classpath'))
        # print tostring(result)
        with open(eclipse_file_path, 'w') as f:
            data = tostring(result, doctype='<?xml version="1.0" encoding="UTF-8"?>')
            data = data.replace('<classpath>', '')
            data = data.replace('</classpath>', '')
            data = data.replace('<?xml version="1.0" encoding="UTF-8"?>',
                                '<?xml version="1.0" encoding="UTF-8"?><classpath>')
            data = data + '</classpath>'
            f.write(data)

        # Add .project file
        project_path = os.path.dirname(fileName) + '/.project'
        xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>%s</name>
    <comment/>
    <projects/>
    <buildSpec>
        <buildCommand>
            <name>org.eclipse.jdt.core.javabuilder</name>
            <arguments/>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.eclipse.jdt.core.javanature</nature>
    </natures>
</projectDescription>"""
        root_name = os.path.splitext(os.path.basename(fileName))[0]
        xml_data = xml_data % root_name
        with open(project_path, 'w') as f:
            f.write(xml_data)
        pbar.update(fcount)
    pbar.finish()
def postprocess(doc):
    # put html lang
    doc.attrib["lang"] = "mk"

    # add meta charset
    doc.cssselect("head")[0].insert(0, Element("meta", attrib={"charset": "utf-8"}))

    # hotlink customized css
    doc.cssselect("head link")[0].attrib["href"] = "stylesheets/timeline-setter-custom.css"

    # add title
    title = Element("title")
    title.text = u"Слободен софтвер Македонија низ годините"
    doc.cssselect("head")[0].insert(0, title)

    # add header
    header = load_partial("header.html")
    doc.cssselect("body")[0].insert(0, header)

    return doc
def clean(self, element):
    cleanElement = None
    dropEmpty = ('span', 'p', 'div')
    downloadDir = self.task.getProperty('download')

    if 'img' == element.tag:
        src = urlparse.urljoin(self.url, element.attrib['src'])
        file, info = urllib.urlretrieve(src)
        url = urlparse.urlparse(src)
        disposition = info.getheader('Content-Disposition')
        filename = None
        if disposition:
            type, filename = disposition.split(';')
            key, filename = filename.split('=')
            filename = filename.strip('"')
        if not filename:
            filename = os.path.basename(file)
        splitf = filename.split('.')
        lenf = len(splitf)
        ext = splitf.pop()
        if lenf < 2 or info.subtype != ext:
            filename = '.'.join((filename, info.subtype))
        element.attrib['src'] = filename
        os.rename(file, '/'.join((downloadDir, filename)))

    # moin specific hack for now
    if 'a' == element.tag and '/Category' in element.attrib['href']:
        pass
    elif element.tag not in dropEmpty \
            or bool(element.getchildren()) \
            or (bool(element.text) and bool(element.text.strip())):
        cleanElement = Element(element.tag)
        cleanElement.text = element.text
        stripattribs = ('class', 'style', 'id')
        for a in element.attrib:
            if a not in stripattribs:
                cleanElement.set(a, element.attrib[a])
        for e in element.getchildren():
            clean = self.clean(e)
            if clean is not None:
                cleanElement.append(clean)
    return cleanElement
def get_user_rating(username, data):
    if not data:
        element = Element('span')
        element.text = username
        return element

    rating = data[1]
    element = Element('a', {'class': 'rate-group',
                            'href': reverse('user_page', args=[username])})
    if rating:
        rating_css = rating_class(rating)
        rate_box = Element('span', {'class': 'rate-box ' + rating_css})
        rate_box.append(Element('span', {'style': 'height: %3.fem' % rating_progress(rating)}))
        user = Element('span', {'class': 'rating ' + rating_css})
        user.text = username
        element.append(rate_box)
        element.append(user)
    else:
        element.text = username
    return element
def view_selection(self, req, resp, url):
    """
    View the highlighted selector (from `action_view`)
    """
    from deliverance.selector import Selector
    doc = document_fromstring(resp.body)
    el = Element('base')
    el.set('href', posixpath.dirname(url) + '/')
    doc.head.insert(0, el)
    selector = Selector.parse(req.GET['selector'])
    dummy_type, elements, dummy_attributes = selector(doc)
    if not elements:
        template = self._not_found_template
    else:
        template = self._found_template
    all_elements = []
    els_in_head = False
    for index, el in enumerate(elements):
        el_in_head = self._el_in_head(el)
        if el_in_head:
            els_in_head = True
        anchor = 'deliverance-selection'
        if index:
            anchor += '-%s' % index
        if el.get('id'):
            anchor = el.get('id')
            ## FIXME: is a <a name> better?
        if not el_in_head:
            el.set('id', anchor)
        else:
            anchor = None
        ## FIXME: add :target CSS rule
        ## FIXME: or better, some Javascript
        all_elements.append((anchor, el))
        if not el_in_head:
            style = el.get('style', '')
            if style:
                style += '; '
            style += '/* deliverance */ border: 2px dotted #f00'
            el.set('style', style)
        else:
            el.set('DELIVERANCE-MATCH', '1')

    def highlight(html_code):
        """Highlights the given code (for use in the template)"""
        if isinstance(html_code, _Element):
            html_code = tostring(html_code)
        return html(pygments_highlight(html_code, HtmlLexer(),
                                       HtmlFormatter(noclasses=True)))

    def format_tag(tag):
        """Highlights the lxml HTML tag"""
        return highlight(tostring(tag).split('>')[0] + '>')

    def wrap_html(html, width=100):
        if isinstance(html, _Element):
            html = tostring(html)
        lines = html.splitlines()
        new_lines = []

        def wrap_html_line(line):
            if len(line) <= width:
                return [line]
            match_trail = re.search(r'^[^<]*</.*?>', line, re.S)
            if match_trail:
                result = [match_trail.group(0)]
                result.extend(wrap_html_line(line[match_trail.end():]))
                return result
            match1 = re.search(r'^[^<]*<[^>]*>', line, re.S)
            match2 = re.search(r'<[^>]*>[^<>]*$', line, re.S)
            if not match1 or not match2:
                return [line]
            result = [match1.group(0)]
            result.extend(wrap_html_line(line[match1.end():match2.start()]))
            result.append(match2.group(0))
            return result

        for line in lines:
            new_lines.extend(wrap_html_line(line))
        return '\n'.join(new_lines)

    def mark_deliv_match(highlighted_text):
        # re.sub takes flags as a keyword argument; passing re.S positionally
        # would be interpreted as the count argument
        result = re.sub(
            r'(?:<[^/][^>]*>)*<.*?DELIVERANCE-MATCH=.*?>(?:</[^>]*>)*',
            lambda match: r'<b style="background-color: #ff8">%s</b>' % match.group(0),
            unicode(highlighted_text), flags=re.S)
        return html(result)

    text = template.substitute(
        base_url=url,
        els_in_head=els_in_head,
        doc=doc,
        elements=all_elements,
        selector=selector,
        format_tag=format_tag,
        highlight=highlight,
        wrap_html=wrap_html,
        mark_deliv_match=mark_deliv_match)
    message = fromstring(
        self._message_template.substitute(message=text, url=url))
    if doc.body.text:
        message.tail = doc.body.text
        doc.body.text = ''
    doc.body.insert(0, message)
    text = tostring(doc)
    return Response(text)
perc = float(output[1].split(':')[1].split('%')[0])
gcov = output[2].strip().split()[1].strip("'")

# move generated gcov to coverage folder
new_dir = os.path.join(target_dir, os.path.dirname(source))
try:
    os.makedirs(new_dir)
except OSError:
    pass
os.rename(os.path.join(obspy_dir, gcov), os.path.join(new_dir, gcov))
cov.append((filename, os.path.join(new_dir, gcov), perc))

# GENERATE HTML
page = fromstring("<html><table></table></html>")
table = page.xpath('.//table')[0]
for name, gcov, perc in cov:
    td1, td2 = Element('td'), Element('td')
    gcov = gcov.replace(target_dir, './')
    a = Element('a', attrib={'href': gcov})
    a.text = name
    td1.append(a)
    td2.text = "%6.2f%%" % perc
    tr = Element('tr')
    tr.extend([td1, td2])
    table.append(tr)
with open(os.path.join(target_dir, 'index.html'), 'wb') as fp:
    fp.write(tostring(page))
cleanup('*.o')