def write_as_html(self, foutput, name: str, url: str, tables: List[ContentTable], html_doc: html.Element):
    """Render one page summary, both as a standalone file and into *html_doc*.

    The summary is a ``<div>`` holding an ``<h1>`` with *name*, a ``<div>``
    with the cached timestamp string for ``name + ".html"``, the
    ``new_element`` of every entry in *tables*, a ``<br>`` and a link to
    *url*.

    A deep copy of that ``<div>`` is wrapped in ``<html><body>`` and written
    (pretty-printed) to *foutput*; the ``<div>`` itself, followed by an
    ``<hr>``, is appended to *html_doc*.
    """
    section = html.Element("div")

    title = html.Element("h1")
    title.text = name
    section.append(title)

    stamp = html.Element("div")
    stamp.text = self.cache.read_date_time_str(name + ".html")
    section.append(stamp)

    section.extend(table.new_element for table in tables)

    section.append(html.Element("br"))

    link = html.Element("a")
    link.attrib["href"] = url
    link.text = url
    section.append(link)

    # Standalone document: the copy keeps `section` free to be attached to
    # the shared document below.
    page = html.Element("html")
    page_body = html.Element("body")
    page.append(page_body)
    page_body.append(deepcopy(section))
    foutput.write(html.tostring(page, pretty_print=True))

    html_doc.append(section)
    html_doc.append(html.Element("hr"))
def write_miss_to_html(self, name: str, url: str, msg: str, html_doc: html.Element):
    """Append a "miss" entry for *name* to *html_doc*.

    The entry is a ``<div>`` with an ``<h1>`` title, a ``<div>`` holding the
    cached timestamp string for ``name + ".html"``, a ``<span>`` carrying
    *msg*, a ``<br>`` and a link to *url*.  An ``<hr>`` is appended after it.
    """
    section = html.Element("div")

    title = html.Element("h1")
    title.text = name

    stamp = html.Element("div")
    stamp.text = self.cache.read_date_time_str(name + ".html")

    note = html.Element("span")
    note.text = msg

    link = html.Element("a")
    link.attrib["href"] = url
    link.text = url

    for child in (title, stamp, note, html.Element("br"), link):
        section.append(child)

    html_doc.append(section)
    html_doc.append(html.Element("hr"))
def get_progressbar_element(percentage):
    """Build a Bootstrap-style progress bar element for *percentage*.

    *percentage* is converted with ``int()`` (so floats are truncated and
    numeric strings are accepted).  The bar is green (``success``) from 70,
    orange (``warning``) from 50 and red (``danger``) below that.

    Returns the outer ``<div class="progress">`` element; its only child is
    the coloured bar, whose width, label and ``aria-valuenow`` all reflect
    the integer percentage.
    """
    percentage = int(percentage)

    if percentage >= 70:
        level = "success"  # green
    elif percentage >= 50:
        level = "warning"
    else:
        level = "danger"

    bar = Element("div")
    bar.attrib["class"] = (
        "progress-bar progress-bar-{} progress-bar-striped".format(level))
    bar.attrib["role"] = "progressbar"
    # Report the real value to assistive technology; this used to be the
    # constant "10" regardless of the percentage shown.
    bar.attrib["aria-valuenow"] = "{}".format(percentage)
    bar.attrib["aria-valuemin"] = "0"
    bar.attrib["aria-valuemax"] = "100"
    bar.attrib["style"] = (
        "width:{}%; text-align:left; padding-left: 5px;".format(percentage))
    bar.text = "{}%".format(percentage)

    progressbar_element = Element("div")
    progressbar_element.attrib["class"] = "progress"
    progressbar_element.attrib["style"] = "margin-bottom:0;"
    progressbar_element.append(bar)
    return progressbar_element
def process_img(self, doc, el):
    """Process an ``<img>`` tag in the source document.

    Steps, in order:

    * add alt text via ``self.add_alt_tags``;
    * stop immediately if the image carries a non-empty ``nomobileresize``
      attribute (the attribute is removed either way);
    * if the image has a ``src``: rewrite it, register it with the short-URL
      storage and point ``src`` at the ``@@shortimageurl`` view, then wrap
      the image in an ``<a>`` that links to the original ``src``;
    * drop explicit ``width``/``height`` attributes, clear floats if needed
      and mark the element as processed.

    Any failure while rewriting the URL is swallowed on purpose so that a
    broken image cannot prevent the page from rendering.
    """
    self.add_alt_tags(el)

    # Skip over images with the nomobileresize attribute
    if el.attrib.pop("nomobileresize", "") != "":
        return

    src = el.attrib.get("src")
    if src:
        originalSrc = src
        site = getSite()

        # Catch exceptions to ensure broken images don't prevent the page
        # from rendering.  `Exception` (rather than a bare except) lets
        # SystemExit / KeyboardInterrupt propagate.
        try:
            src = self.rewrite(src)
            shorturl = getUtility(IMobileImageShortURLStorage)
            key = shorturl.getkey(src)
            if key is None:
                key = shorturl.suggest()
                # just check that suggest() is working as expected
                assert shorturl.get(key) is None
                shorturl.add(key, src)
            src = '%s/@@shortimageurl/%s' % (site.absolute_url(), key)
            el.attrib["src"] = src
        except Exception:
            # Blank the alt text; pop() so a missing attribute cannot raise
            # a second error from inside the handler.
            el.attrib.pop("alt", None)
            el.attrib["src"] = src
            # Image processing errors are intentionally not logged: they
            # created unnecessary noise in the error log.  The diagnostic
            # list that used to be assembled here was never used, and
            # reading request attributes for it could itself raise.

        # Make image clickable and point to original src
        a = Element('a')
        a.attrib['href'] = originalSrc
        el.getparent().replace(el, a)
        a.append(el)

    # Remove explicit width declarations
    if "width" in el.attrib:
        del el.attrib["width"]
    if "height" in el.attrib:
        del el.attrib["height"]

    if self.needs_clearing(el):
        self.clear_floats(el)

    self.add_processed_class(el)
def _dumps_xml_from_pml_nodes(root_node): node_name, attributes, sub_nodes = root_node element = Element(node_name, **attributes) for sub_node in sub_nodes: element.append(_dumps_xml_from_pml_nodes(sub_node)) return element
def footer(self, node):
    """Create a standard footer block for HTML files.

    The returned ``<footer>`` element contains a horizontal rule followed by
    a paragraph naming ``node.sourcefile`` and the current local time.
    """
    block = Element('footer')
    block.append(E.HR())

    template = "Generated automatically from {source} at {time:%d %b %Y %H:%M}."
    source = node.sourcefile
    generated_at = datetime.datetime.now()
    block.append(E.P(template.format(source=source, time=generated_at)))
    return block
def get_user(username, data):
    """Return a ``<span>`` element representing *username*.

    With falsy *data* the span simply contains the name as text.  Otherwise
    the span gets the CSS class computed by
    ``Profile.get_user_css_class(*data)`` and wraps a link to the user's
    page.
    """
    if data:
        wrapper = Element('span', {'class': Profile.get_user_css_class(*data)})
        anchor = Element('a', {'href': reverse('user_page', args=[username])})
        anchor.text = username
        wrapper.append(anchor)
    else:
        wrapper = Element('span')
        wrapper.text = username
    return wrapper
def get_user(username, data):
    """Build the ``<span>`` used to display *username*.

    * falsy *data*: a plain span whose text is the username;
    * otherwise: a span classed via ``Profile.get_user_css_class(*data)``
      that contains an ``<a>`` pointing at the ``user_page`` URL.
    """
    if not data:
        plain = Element('span')
        plain.text = username
        return plain

    css_class = Profile.get_user_css_class(*data)
    styled = Element('span', {'class': css_class})
    profile_link = Element('a', {'href': reverse('user_page', args=[username])})
    profile_link.text = username
    styled.append(profile_link)
    return styled
def wrap_set(dom, child_tag, parent_tag):
    """Wrap unbroken runs of elements in a parent container.

    Typical uses:
    - <li> in a <ul>
    - <tr> in a <table>

    Every element matched by ``dom.cssselect(child_tag)`` is moved into a
    *parent_tag* element.  A new container is started whenever the current
    match is not the element that was expected to follow the previous one.
    """
    # 0 is a sentinel that never equals an element, so the first match
    # always opens a container.
    expected = 0
    for child in dom.cssselect(child_tag):
        if expected != child:
            container = Element(parent_tag)
            insert(container, child)
        container.append(child)

        # Whatever follows the child's (new) parent is what must come next
        # for the run to continue.
        expected = parent(child).getnext()
        if expected is None:
            expected = child.getnext()
def _apply_headers_anchors(html: str) -> str:
    """Prefix every top-level header in *html* with a self-referencing anchor.

    The markup is wrapped in a fake root tag for parsing.  For each direct
    child whose tag is in ``HEADERS`` an ``<a id=... href="#...">`` holding
    an icon ``<span>`` is inserted as the header's first child, and a space
    is appended to the header's leading text.  The fake tag is removed again
    before returning.

    The id comes from ``make_header_id`` applied to the header's leading
    text.  Headers that start with a child element (leading text is
    ``None``) previously raised ``TypeError``; for those the id is derived
    from the concatenated text of the whole header instead.
    """
    root_element = fromstring(wrap_unwrap_fake_tag(html))
    for element in root_element:
        if element.tag not in HEADERS:
            continue

        leading_text = element.text
        id_source = leading_text
        if id_source is None:
            # e.g. "<h2><code>x</code> y</h2>": no leading text node at all.
            id_source = ''.join(element.itertext())
        id_ = make_header_id(id_source)

        a_element = Element('a', {'id': id_, 'href': f'#{id_}'})
        span_element = Element('span', attrib={'class': 'iconify', 'data-icon': HTMLGen.ANCHOR_LINK_ICON_CLASS})
        a_element.append(span_element)

        element.text = (leading_text or '') + ' '
        element.insert(0, a_element)

    html = tostring(root_element)
    html = wrap_unwrap_fake_tag(html, wrap=False)
    return html
def wrap_set(dom, child_tag, parent_tag):
    """Group consecutive matching elements under a shared container.

    For example:
    - <li> in a <ul>
    - <tr> in a <table>

    Each element selected by ``dom.cssselect(child_tag)`` ends up inside a
    *parent_tag* element; a fresh container is created every time the match
    differs from the element expected to continue the current run.
    """
    following = 0  # sentinel: compares unequal to any element
    for match in dom.cssselect(child_tag):
        starts_new_run = following != match
        if starts_new_run:
            wrapper = Element(parent_tag)
            insert(wrapper, match)
        wrapper.append(match)

        following = parent(match).getnext()
        if following is None:
            following = match.getnext()
def _add_html_info_row(self, t: html.Element, label: str, val: str, cls: str = None):
    """Append a two-cell ``<tr>`` (label, value) to the element *t*.

    When *cls* is given (not ``None``) it is set as the ``class`` attribute
    of both cells.  The row's tail is set to a newline plus a space so the
    serialised markup stays readable.
    """
    tr = html.Element("tr")
    for cell_text in (label, val):
        td = html.Element("td")
        td.text = cell_text
        if cls is not None:
            td.attrib["class"] = cls
        tr.append(td)
    tr.tail = "\n "
    t.append(tr)
def brs_to_paragraphs(tree, inline_tags=None):
    """
    Return an lxml tree with all <br> elements stripped and paragraphs put in
    place where necessary.

    *inline_tags* lists the tags that are added to the paragraph currently
    being built (default: ``['a']``); the same list is used for nested
    elements.  Trees without children or without any ``<br>`` descendant are
    returned unchanged; otherwise a new element with the same tag (and no
    attributes) is built and returned.
    """
    # add these tags to p's that we're currently building, any other tags will
    # close the current p
    inline_tags = inline_tags or ['a']

    # if this tree doesn't have any child elements, just return it as is
    if len(tree) == 0:
        return tree

    # if this tree doesn't contain any <br> tags, we don't need to touch it
    if tree.find('.//br') is None:
        return tree

    # XXX: We're building a whole new tree here and leaving out any attributes.
    # A) That might be a little slower and more memory intensive than modifying
    # the tree in place, and B) we're dropping any attributes on block elements.
    # The latter is probably fine for current use, but certainly not ideal.
    new_tree = Element(tree.tag)

    # The paragraph currently being built; None until one has been started.
    p = None

    # if this tree starts out with text, create a new paragraph for it, and
    # add it to the tree
    if tree.text:
        p = E.P()
        p.text = tree.text
        new_tree.append(p)

    for e in tree:
        if e.tag == 'br':
            # avoid adding empty p elements
            if e.tail is None:
                continue
            # start a new p
            p = E.P()
            p.text = e.tail
            new_tree.append(p)

        # if this is a block tag, and it has trailing text, that text needs to
        # go into a new paragraph... only if the tail has actual content and
        # not just whitespace though.
        # NOTE(review): re.match only inspects the first character, so a tail
        # with leading whitespace is treated as empty - confirm whether
        # re.search was intended.
        elif e.tail and re.match(r'[^\s]', e.tail) and e.tag not in inline_tags:
            p = E.P()
            p.text = e.tail
            e.tail = ''
            new_tree.append(e)
            new_tree.append(p)

        # keep inline tags inside the current paragraph
        elif e.tag in inline_tags:
            if p is None:
                # No paragraph has been opened yet (the tree starts with an
                # inline element); start one instead of failing on an
                # unbound name.
                p = E.P()
                new_tree.append(p)
            p.append(e)

        else:
            # Recurse with the caller's inline tags rather than silently
            # falling back to the default list.
            new_tree.append(brs_to_paragraphs(e, inline_tags))

    return new_tree
def brs_to_paragraphs(tree, inline_tags=None):
    """
    Return an lxml tree with all <br> elements stripped and paragraphs put in
    place where necessary.

    *inline_tags* lists the tags that are added to the paragraph currently
    being built (default: ``["a"]``); the same list is used for nested
    elements.  Trees without children or without any ``<br>`` descendant are
    returned unchanged; otherwise a new element with the same tag (and no
    attributes) is built and returned.
    """
    # add these tags to p's that we're currently building, any other tags will
    # close the current p
    inline_tags = inline_tags or ["a"]

    # if this tree doesn't have any child elements, just return it as is
    if len(tree) == 0:
        return tree

    # if this tree doesn't contain any <br> tags, we don't need to touch it
    if tree.find(".//br") is None:
        return tree

    # XXX: We're building a whole new tree here and leaving out any attributes.
    # A) That might be a little slower and more memory intensive than modifying
    # the tree in place, and B) we're dropping any attributes on block elements.
    # The latter is probably fine for current use, but certainly not ideal.
    new_tree = Element(tree.tag)

    # The paragraph currently being built; None until one has been started.
    p = None

    # if this tree starts out with text, create a new paragraph for it, and
    # add it to the tree
    if tree.text:
        p = E.P()
        p.text = tree.text
        new_tree.append(p)

    for e in tree:
        if e.tag == "br":
            # avoid adding empty p elements
            if e.tail is None:
                continue
            # start a new p
            p = E.P()
            p.text = e.tail
            new_tree.append(p)

        # if this is a block tag, and it has trailing text, that text needs to
        # go into a new paragraph... only if the tail has actual content and
        # not just whitespace though.
        # NOTE(review): re.match only inspects the first character, so a tail
        # with leading whitespace is treated as empty - confirm whether
        # re.search was intended.
        elif e.tail and re.match(r"[^\s]", e.tail) and e.tag not in inline_tags:
            p = E.P()
            p.text = e.tail
            e.tail = ""
            new_tree.append(e)
            new_tree.append(p)

        # keep inline tags inside the current paragraph
        elif e.tag in inline_tags:
            if p is None:
                # No paragraph has been opened yet (the tree starts with an
                # inline element); start one instead of failing on an
                # unbound name.
                p = E.P()
                new_tree.append(p)
            p.append(e)

        else:
            # Recurse with the caller's inline tags rather than silently
            # falling back to the default list.
            new_tree.append(brs_to_paragraphs(e, inline_tags))

    return new_tree
def clean(self, element):
    """Return a cleaned copy of *element*, or ``None`` if it should be dropped.

    * ``<img>`` elements with a ``src`` have their image downloaded; the
      file is moved into the task's ``download`` directory and ``src`` is
      rewritten to the bare file name.
    * ``<a>`` elements whose ``href`` contains ``/Category`` are dropped.
    * ``span``/``p``/``div`` elements with neither children nor
      non-whitespace text are dropped.
    * Everything else is copied without its ``class``, ``style`` and ``id``
      attributes, and its children are cleaned recursively.
    """
    cleanElement = None
    dropEmpty = ('span', 'p', 'div')
    downloadDir = self.task.getProperty('download')

    # An <img> without a src has nothing to download; it is copied as is.
    if 'img' == element.tag and element.get('src'):
        src = urlparse.urljoin(self.url, element.attrib['src'])
        tmp_path, info = urllib.urlretrieve(src)

        filename = None
        disposition = info.getheader('Content-Disposition')
        if disposition:
            # Look for a "filename=" parameter instead of assuming the
            # header has exactly two ';'-separated parts.
            for param in disposition.split(';')[1:]:
                param_name, sep, param_value = param.partition('=')
                if sep and param_name.strip().lower() == 'filename':
                    filename = param_value.strip().strip('"')
                    break
        if filename:
            # The name comes from the remote server: keep only its last
            # path component so it cannot escape the download directory.
            filename = os.path.basename(filename)
        if not filename:
            filename = os.path.basename(tmp_path)

        # Make sure the file name ends with the MIME subtype.
        name_parts = filename.split('.')
        if len(name_parts) < 2 or info.subtype != name_parts[-1]:
            filename = '.'.join((filename, info.subtype))

        element.attrib['src'] = filename
        os.rename(tmp_path, os.path.join(downloadDir, filename))

    # moin specific hack for now: drop category links.  get() keeps anchors
    # without an href (e.g. named anchors) from raising KeyError.
    if 'a' == element.tag and '/Category' in element.get('href', ''):
        return cleanElement

    has_text = bool(element.text) and bool(element.text.strip())
    if element.tag not in dropEmpty or len(element) > 0 or has_text:
        cleanElement = Element(element.tag)
        cleanElement.text = element.text

        stripattribs = ('class', 'style', 'id')
        for attr_name, attr_value in element.attrib.items():
            if attr_name not in stripattribs:
                cleanElement.set(attr_name, attr_value)

        for child in element:
            cleaned_child = self.clean(child)
            if cleaned_child is not None:
                cleanElement.append(cleaned_child)

    return cleanElement
def load_info(self, item: ChangeItem, body: html.Element):
    """Fill *body* with the header block for *item*.

    Appends an ``<h3>`` with the item name, the source links produced by
    ``html_helpers.make_source_links("extract", ...)`` and a ``<br>``.  The
    text/tail values only add whitespace between the serialised elements.
    """
    body.text = "\n "

    heading = html.Element("h3")
    heading.text = item.name
    heading.tail = "\n\n "
    body.append(heading)

    source_links = html_helpers.make_source_links("extract", item.name, item.source)
    body.append(source_links)
    # The element just appended is the last child of body.
    body[-1].tail = "\n "

    line_break = html.Element("br")
    line_break.tail = "\n "
    body.append(line_break)
def get_user_rating(username, rating):
    """Return an ``<a class="rate-group">`` element linking to the user's page.

    With a truthy *rating* the link contains a rating box (whose inner span
    gets a height derived from ``rating_progress``) followed by the username
    in a span classed by ``rating_class``.  Otherwise the link's text is
    just the username.
    """
    link = Element('a', {
        'class': 'rate-group',
        'href': reverse('user_page', args=[username]),
    })

    if not rating:
        link.text = username
        return link

    css = rating_class(rating)

    box = Element('span', {'class': 'rate-box ' + css})
    # NOTE(review): '%3.f' means width 3 / precision 0, so the height is
    # rounded to a whole number - confirm this is not a typo for '%.3f'.
    fill = Element('span', {'style': 'height: %3.fem' % rating_progress(rating)})
    box.append(fill)

    label = Element('span', {'class': 'rating ' + css})
    label.text = username

    link.append(box)
    link.append(label)
    return link
def get_user_rating(username, data):
    """Return the element used to display *username* together with a rating.

    * falsy *data*: a plain ``<span>`` containing the username;
    * otherwise ``data[1]`` is taken as the rating and an
      ``<a class="rate-group">`` linking to the user's page is returned.
      With a truthy rating it holds a rating box plus a classed username
      span; with a falsy rating it holds only the username as text.
    """
    if not data:
        plain = Element('span')
        plain.text = username
        return plain

    rating = data[1]
    link = Element('a', {'class': 'rate-group', 'href': reverse('user_page', args=[username])})

    if not rating:
        link.text = username
        return link

    css = rating_class(rating)

    box = Element('span', {'class': 'rate-box ' + css})
    # NOTE(review): '%3.f' means width 3 / precision 0, so the height is
    # rounded to a whole number - confirm this is not a typo for '%.3f'.
    fill = Element('span', {'style': 'height: %3.fem' % rating_progress(rating)})
    box.append(fill)

    label = Element('span', {'class': 'rating ' + css})
    label.text = username

    link.append(box)
    link.append(label)
    return link
def massarge_input_file(input_file_name):
    """Parse an InDesign-exported HTML file and normalise its markup.

    The file is first passed to ``bad_classes`` and then parsed.  The tree
    is modified in place, in this order: the contents box is removed,
    override classes are stripped, the file's own name is removed from link
    targets, hanging-indent paragraphs are split on their first tab, bullet
    marker spans are dropped, question / ministerial-statement numbers are
    wrapped in ``number-span`` spans, front-page tables are marked as
    presentational and given column widths, sponsor groups are split into
    grid columns, redundant classes and wrapper spans are removed and every
    heading is demoted by one level.

    Returns the root element of the modified tree.
    """
    # Check for bad classes.  The source carries a commented-out try/except
    # around this call, so a failure here propagates to the caller.
    bad_classes(input_file_name)

    input_root = html.parse(input_file_name).getroot()

    # remove the contents div
    contents_div = input_root.xpath('body/div[@class="Contents-Box"]')
    if len(contents_div) > 0:
        contents_div[0].getparent().remove(contents_div[0])

    # remove all the paragraph override classes
    # NOTE(review): the pattern spells "ParraOveride"; confirm it matches
    # the class names InDesign actually emits.
    override_pattern = re.compile(r' ?_idGenParraOveride\d\d?\d?')
    for paragraph in input_root.xpath('//p|//h1|//h2|//h3|//h4|//h5|//h6'):
        class_value = paragraph.get('class', default='')
        if override_pattern.search(class_value) is not None:
            print('override')
            paragraph.set('class', override_pattern.sub('', class_value))

    # remove filename for internal hyperlinks
    inDesign_file_name = Path(input_file_name).name
    for link in input_root.xpath('//a'):
        if 'href' in link.attrib:
            link.attrib['href'] = link.attrib['href'].replace(
                inDesign_file_name, '')

    # there are 3 paragraph styles with hanging indents that must be
    # manipulated
    for paragraph in input_root.xpath(
            '//p[@class="paraMotionSub-Paragraph" or '
            '@class="paraMotionSub-Sub-Paragraph" or '
            '@class="paraMotionSub-Sub-Sub-Paragraph"][text()]'):
        # The [text()] predicate also matches text that follows a child
        # element, in which case the paragraph's own leading text is None.
        leading_text = paragraph.text
        if not leading_text:
            continue
        hanging_text, tab, remainder = leading_text.partition('\u0009')
        if not tab:
            # dont do anything if there is no tab
            continue
        span_hanging = Element('span')
        span_hanging.set('class', 'hanging1')
        span_hanging.text = hanging_text
        span_hanging.tail = remainder
        # Insert at the front so the hanging part and the rest of the
        # leading text stay ahead of any existing inline children.
        paragraph.insert(0, span_hanging)
        paragraph.text = ''

    # sort out all the bullets
    for bullet in input_root.xpath('//span[@class="pythonFindBullet"]'):
        bullet.drop_tree()

    # sort the numbers
    for number in input_root.xpath('//p[@class="paraQuestion"]/span[1]'):
        # consider changing this in InDesign
        number.attrib['class'] = 'charBallotNumber'
        new_span = Element('span')
        new_span.classes.add('number-span')
        number_parent = number.getparent()
        new_span.append(number)
        number_parent.insert(0, new_span)

    # sort ministerial statements
    for statement in input_root.xpath(
            '//p[@class="paraMinisterialStatement"]/span[1]'):
        statement.attrib['class'] = 'charItemNumber'
        statement_tail_text = statement.tail
        statement.tail = ''
        new_span = Element('span')
        new_span.classes.add('number-span')
        new_span.tail = statement_tail_text
        number_parent = statement.getparent()
        new_span.append(statement)
        number_parent.insert(0, new_span)

    # sort the front page tables
    for table in input_root.xpath('//table[@class="Front-Page-Table"]'):
        # added as a result of an accessibility audit
        table.set('role', 'presentation')

    for colgroup in input_root.xpath(
            '//table[@class="Front-Page-Table"]/colgroup'):
        colgroup[0].attrib.pop("class", None)
        colgroup[0].attrib['width'] = '24%'
        colgroup[1].attrib.pop("class", None)
        colgroup[1].attrib['width'] = '76%'

    # sort motion sponsor groups
    sponsor_groups_xpath = '//p[@class="paraMotionSponsorGroup"]' \
                           '|//p[@class="MotionAmmendmentSponsorGroup"]' \
                           '|//p[@class="MotionAmmendmentSponsorGroup"]/span' \
                           '|//p[@class="A2A-SponsorGroup"]'
    for sponsor_group in input_root.xpath(sponsor_groups_xpath):
        if not sponsor_group.text:
            continue
        sponsor_group.classes.add('row')
        # split text on the tab character (InDesign puts in)
        sponsor_names = sponsor_group.text.split('\u0009')
        sponsor_group.text = None
        for sponsor_name in sponsor_names:
            sponsor_span = SubElement(sponsor_group, 'span')
            sponsor_span.classes.update(('col-12', 'col-sm-6', 'col-lg-4'))
            sponsor_span.text = sponsor_name

    # <strong class="Bold"> is overkill
    for strong_ele in input_root.xpath('//strong'):
        strong_ele.classes.discard('Bold')

    # dont need <span class="Hyperlink"> in a <a>
    for span in input_root.xpath('//a/span[@class="Hyperlink"]'):
        span.drop_tag()

    # seems like sometimes there are empty span.charStandingOrderReference
    for span in input_root.xpath(
            '//span[@class="charStandingOrderReference"]'):
        if not span.text or span.text.isspace():
            span.drop_tag()

    # Front-Page-Table doesnt need to be on the table the row and the td
    for tr in input_root.xpath(
            '//table[@class="Front-Page-Table"]//tr[@class="Front-Page-Table"]'
    ):
        tr.classes.discard('Front-Page-Table')
        for child in tr.iterchildren('td', 'th'):
            child.classes.discard('Front-Page-Table')

    # Demote every heading by one level (h1 -> h2, h2 -> h3, ...).  The
    # deepest levels go first so a heading is never renamed twice; h6 has
    # nowhere to go and is left alone.
    heading_tags = ['h6', 'h5', 'h4', 'h3', 'h2', 'h1']
    for heading_tag, new_heading_tag in zip(heading_tags[1:], heading_tags):
        for heading in input_root.xpath(f'//{heading_tag}'):
            heading.tag = new_heading_tag

    # return the modified input html root element
    return input_root
# NOTE(review): this is a fragment - `output`, `source`, `filename`, `cov`,
# `target_dir` and `obspy_dir` are defined outside the visible lines, and the
# statements up to `cov.append(...)` presumably run once per source file
# inside a loop whose header is not shown here.

# Parse the coverage percentage (text between ':' and '%' on the second
# output line) and the quoted .gcov file name on the third output line.
perc = float(output[1].split(':')[1].split('%')[0])
gcov = output[2].strip().split()[1].strip("'")
# move generated gcov to coverage folder
new_dir = os.path.join(target_dir, os.path.dirname(source))
try:
    os.makedirs(new_dir)
except OSError:
    # any OSError (e.g. the directory already exists) is ignored
    pass
os.rename(os.path.join(obspy_dir, gcov), os.path.join(new_dir, gcov))
cov.append((filename, os.path.join(new_dir, gcov), perc))
# GENERATE HTML
# Build an index page: one table row per entry of `cov`, linking the file
# name to its .gcov report next to the coverage percentage.
page = fromstring("<html><table></table></html>")
table = page.xpath('.//table')[0]
for name, gcov, perc in cov:
    td1, td2 = Element('td'), Element('td')
    # replace the target_dir part of the report path with './'
    gcov = gcov.replace(target_dir, './')
    a = Element('a', attrib={'href': gcov})
    a.text = name
    td1.append(a)
    td2.text = "%6.2f%%" % perc
    tr = Element('tr')
    tr.extend([td1, td2])
    table.append(tr)
# the page is written in binary mode
with open(os.path.join(target_dir, 'index.html'), 'wb') as fp:
    fp.write(tostring(page))
cleanup('*.o')
def _add_data_row(self, t: html.Element, x: ChangeItem, kind: str):
    """Append one table row describing the change item *x* to *t*.

    Rows are skipped for ``main_sheet.html`` and for ``*_data.html`` items
    whose status is ``duplicate``.  Otherwise the row has six cells: name
    (link), status, last-changed time, delta to ``self.start_date``, live
    page link and pipeline source links.

    An item looks roughly like::

        {
          "name": "AK.html",
          "status": "unchanged",
          "url": "http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-19/default.aspx",
          "msg": null,
          "complete": true,
          "added": "2020-03-13T06:17:50.550545",
          "checked": "2020-03-16T22:00:07.143700",
          "updated": "2020-03-16T21:40:10.611841",
          "failed": null,
          "source": "google-states"
        }
    """
    name = x.name
    status = x.status

    if name == "main_sheet.html":
        return
    if name.endswith("_data.html") and status == "duplicate":
        return

    prefix = "\n "

    tr = html.Element("tr")
    tr.tail = prefix
    # Attach the row once; it used to be appended three times, which only
    # re-attached the same element.
    t.append(tr)

    def add_cell(tail=prefix):
        """Create a <td> with the given tail, attach it to the row, return it."""
        cell = html.Element("td")
        cell.tail = tail
        tr.append(cell)
        return cell

    # Name
    td = add_cell()
    a = html.Element("a")
    a.attrib["href"] = name
    a.text = name.replace(".html", "")
    td.append(a)

    # Status
    td = add_cell()
    td.attrib["class"] = status
    td.text = status

    # Last Changed
    updated_at = x.updated
    failed_at = x.failed
    td = add_cell()
    if failed_at is not None:
        td.attrib["class"] = "failed"
        td.text = udatetime.to_displayformat(failed_at)
    else:
        td.text = udatetime.to_displayformat(updated_at)

    # Delta
    td = add_cell()
    v = updated_at if failed_at is None else failed_at
    # NOTE(review): the sample status above is lower-case ("unchanged");
    # confirm that "CHANGED" is the spelling actually used for changed items.
    td.text = udatetime.format_difference(self.start_date, v) if status != "CHANGED" else ""

    # Live Page
    url = x.url
    td = add_cell()
    a = html.Element("a")
    a.attrib["href"] = url
    if len(url) < 80:
        a.text = url
    else:
        # Long URLs are truncated, with the full text in a tooltip span.
        a.text = url[0: 80] + " ..."
        a.attrib["class"] = "tooltip"
        s = html.Element("span")
        s.text = url
        s.attrib["class"] = "tooltiptext"
        a.append(s)
    td.append(a)

    # Pipeline
    source = x.source
    if source is None:
        source = "google-states"
    td = add_cell(prefix[:-2])
    div = html_helpers.make_source_links(kind, name, source)
    td.append(div)
# read stdout filename = fp.readline().strip().split()[1].strip("'") perc = float(fp.readline().split(':')[1].split('%')[0]) gcov = fp.readline().strip().split()[1].strip("'") # move genereted gcov to coverage folder new_dir = join(target_dir, dirname(source)) try: makedirs(new_dir) except OSError: pass rename(join(obspy_dir, gcov), join(new_dir, gcov)) cov.append((filename, join(new_dir, gcov), perc)) # GENERATE HTML page = fromstring("<html><table></table></html>") table = page.xpath('.//table')[0] for name, gcov, perc in cov: td1, td2 = Element('td'), Element('td') gcov = gcov.replace(target_dir, './') a = Element('a', attrib={'href': gcov}) a.text = name td1.append(a) td2.text = "%6.2f%%" % perc tr = Element('tr') tr.extend([td1, td2]) table.append(tr) with open(join(target_dir, 'index.html'), 'wb') as fp: fp.write(tostring(page)) cleanup('*.o')
def message_proc(message):
    """Convert one Telegram web-preview message element into an RSS item.

    The item links to ``https://t.me/s/<data-post>``.  Its description is
    the serialised message text, preceded by the replied-to message and
    followed by the link preview (each wrapped in ``<blockquote>``) when
    those are present.  The title is the plain text of the message, cut to
    30 characters plus an ellipsis when longer.

    The passed element is modified in place while the markup is rewritten.
    """
    def markup(node):
        """Serialise *node* to a stripped string without carriage returns."""
        return tostring(node, encoding=str).strip().replace('\r', '')

    url = f"https://t.me/s/{message.get('data-post')}"

    author_nodes = message.xpath('.//span[@class="tgme_widget_message_from_author"]')
    author = author_nodes[0].text_content() if author_nodes else ''

    time_node = message.xpath('.//a[@class="tgme_widget_message_date"]/time')[0]
    date = datetime.datetime.fromisoformat(time_node.get('datetime'))

    text = message.xpath('.//div[starts-with(@class, "tgme_widget_message_text ")]')[0]
    del text.attrib['class']
    content = markup(text)

    reply_nodes = message.xpath('.//a[@class="tgme_widget_message_reply"]')
    if reply_nodes:
        reply = reply_nodes[0]
        # Turn the reply link into a <div> and move its target, rewritten
        # to the /s/ preview URL, onto its first child.
        reply.tag = 'div'
        first_child = reply[0]
        first_child.tag = 'a'
        first_child.set('href', reply.get('href').replace('https://t.me/', 'https://t.me/s/'))
        del reply.attrib['href']
        content = "<blockquote>%s</blockquote>" % markup(reply) + content

    preview_nodes = message.xpath('.//a[@class="tgme_widget_message_link_preview"]')
    if preview_nodes:
        linkpreview = preview_nodes[0]
        linkpreview.tag = 'div'

        # Replace the site name with <div><strong>name</strong></div>.
        sitename = linkpreview.xpath('.//div[@class="link_preview_site_name"]')[0]
        sitestrong = Element('strong')
        sitestrong.text = sitename.text_content()
        sitediv = Element('div')
        sitediv.append(sitestrong)
        sitename.getparent().replace(sitename, sitediv)

        # The preview title becomes the actual link.
        previewtitle = linkpreview.xpath('.//div[@class="link_preview_title"]')[0]
        previewtitle.tag = 'a'
        previewtitle.set('href', linkpreview.get('href'))
        del linkpreview.attrib['href']

        image_nodes = linkpreview.xpath('.//i[@class="link_preview_right_image"]')
        if image_nodes:
            image = image_nodes[0]
            image.tag = 'img'
            # The image URL is the single-quoted part of the inline style.
            image.set('src', image.attrib.pop('style').split("'")[1])
            image.set('style', 'max-height: 5em;')

        content += "<blockquote>%s</blockquote>" % markup(linkpreview)

    content_text = text.text_content()
    title = "%s……" % (content_text[:30]) if len(content_text) > 30 else content_text

    return PyRSS2Gen.RSSItem(
        title=title,
        link=url,
        guid=url,
        description=content,
        author=author,
        pubDate=date,
    )