def link_fix(src, fix):
    """Rewrite every ``wiki:``-prefixed link in *src* via the *fix* callback.

    Parses *src* as an HTML fragment, finds each anchor with an ``href``,
    and for hrefs starting with ``wiki:`` replaces the attribute with
    ``fix(quote(<part after the prefix>))``.  Returns the serialized
    fragment.
    """
    xhtml = {'h': 'http://www.w3.org/1999/xhtml'}
    root = html5.fragment_fromstring(src, create_parent='div')
    for anchor in root.xpath('//h:a[@href]', namespaces=xhtml):
        href = anchor.attrib['href']
        if href.startswith('wiki:'):
            anchor.attrib['href'] = fix(urlparse.quote(href[5:]))
    return serialize_fragment(root)
def next_hours_days(self, location: AnyLocation) -> Optional[NextHoursDaysData]:
    """Fetch and parse the "next hours / next days" forecast for *location*.

    Returns ``None`` when no station id is cached for the location or
    when the HTTP request does not complete with a 2xx status.
    """
    city_id = self.location_cache.get_station_id(location)
    if city_id is None:
        return None
    url = f'{self.AJAX_URL}/weathernexthoursdays?city_id={city_id}&lang=de&units=de&tf=1'
    with self.get_session() as session:
        with session.get(url) as resp:
            status_code = resp.status_code
            if status_code not in range(200, 300):
                # Fix: logger.warn() is a deprecated alias; the documented
                # spelling is logger.warning().
                logger.warning(
                    'Could not retrieve date for url=%r, status_code=%r',
                    url, status_code)
                return None
            content = resp.text
            root = fragment_fromstring(content, create_parent='root')
            data_hours = xpath(
                root,
                f'/root/xhtml:div[{has_class("nexthours-scroll")}]/xhtml:div/xhtml:div',
            )
            data_hours = [parse_hour(hour_data) for hour_data in data_hours]
            data_days = xpath(
                root,
                f'/root/xhtml:div[{has_class("day-row")}]/xhtml:div/xhtml:div')
            data_days = [parse_day(day_data) for day_data in data_days]
            return NextHoursDaysData(data_hours, data_days)
def do_update(self, differences):
    """Apply a page/diff mapping to the search index.

    *differences* maps page filename -> ``(op, raw_bytes)`` where *op*
    is one of ``'created'``, ``'updated'``, ``'deleted'``.  Redirect
    pages (``'=> target'``) are indexed with an empty body; normal
    pages are rendered to HTML and indexed as plain text.
    """
    self.index = self.index.refresh()
    with self.index.writer() as writer:
        for page, (op, raw) in differences.items():
            if raw:
                text = raw.decode('utf-8')
                if text.startswith('=> '):
                    doc = {
                        'filename': page,
                        'url': filename_to_url(page),
                        'title': filename_to_title(page),
                        'content': '',
                        'redirect_to': text[3:].strip(),
                    }
                else:
                    rendered = html5.fragment_fromstring(
                        self.site.to_html(text), create_parent='div')
                    doc = {
                        'filename': page,
                        'url': filename_to_url(page),
                        'title': filename_to_title(page),
                        'content': ' '.join(rendered.xpath('//text()')),
                        'redirect_to': None,
                    }
            if op == 'created':
                writer.add_document(**doc)
            elif op == 'updated':
                writer.update_document(**doc)
            elif op == 'deleted':
                # Deletions go by term; no document body is needed.
                writer.delete_by_term('filename', page)
def find_iter(skeleton, document):
    """
    Return an iterator that yields elements from the document that
    match the given skeleton.  Both arguments may be markup strings,
    in which case they are parsed first.  See `find_all` for details.
    """
    if is_string(document):
        document = html5parser.document_fromstring(document)
    if is_string(skeleton):
        skeleton = html5parser.fragment_fromstring(skeleton)
    matches = (node for node in document.iterdescendants()
               if node_matches_bone(node, skeleton))
    yield from matches
def index_content(page, content):
    """Index one wiki page: record a redirect or its outgoing links.

    NOTE(review): reads `self`, `c` (DB cursor) and `ns` without taking
    them as parameters — presumably a closure inside a method; confirm
    against the enclosing scope.
    """
    content = content.decode('utf-8')
    if content.startswith('=> '):
        # Redirect page: store source -> target and stop.
        c.execute('INSERT INTO redirects (source, target) VALUES (?, ?)',
                  (page, content[3:].strip()))
        return
    src = html5.fragment_fromstring(self.site.to_html(content),
                                    create_parent='div')
    links = src.xpath('//h:a[@href]', namespaces=ns)
    page_links = set()
    for link in links:
        dest = link.attrib['href']
        # Fix: removed dead local `destinfo = urlparse(dest)` — the
        # parsed result was never used.
        if dest.startswith('wiki:'):
            dest_name = url_to_filename(dest[5:])
            page_links.add((page, dest_name))
        elif dest.rstrip('/') == self.site.base_url.rstrip('/'):
            page_links.add((page, 'Homepage'))
    c.executemany('INSERT INTO links (source, target) VALUES (?, ?)',
                  list(page_links))
def convert_link_to_latex(instr):
    """Render an HTML anchor string as a LaTeX ``\\href`` command."""
    anchor = html5parser.fragment_fromstring(instr)
    return u'\\href{{{0}}}{{{1}}}'.format(anchor.get('href'), anchor.text)
def convert_link_to_latex(instr):
    """Render an HTML anchor as ``\\href``, LaTeX-escaping the URL."""
    anchor = html5parser.fragment_fromstring(instr)
    escaped_href = latex_escape(anchor.get('href'), ignore_math=True)
    return u'\\href{%s}{%s}' % (escaped_href, anchor.text)
def call_it(self, *args, **kwargs):
    """Delegate to lxml's html5 ``fragment_fromstring`` (lazy import)."""
    from lxml.html.html5parser import fragment_fromstring as target
    return target(*args, **kwargs)
def html_diff(a, b, insertion=default_ins, deletion=default_del):
    """Diff two HTML fragments and render the result back as HTML."""
    def _leaves(markup):
        # Parse one side and flatten it into comparable leaves.
        return leaves(fragment_fromstring(markup, create_parent='div'))

    diffed = diffleaves(_leaves(a), _leaves(b))
    return leaves2html(diffed, insertion, deletion)
def convert_link_to_latex(instr):
    """Turn an HTML anchor string into a LaTeX ``\\href`` command."""
    parsed = html5parser.fragment_fromstring(instr)
    href, label = parsed.get('href'), parsed.text
    return u'\\href{%s}{%s}' % (href, label)
def sanitize_html(html):
    """Parse *html*, run it through the cleaner, and return safe markup."""
    fragment = html5parser.fragment_fromstring(html, create_parent="div")
    cleaned = cleaner.clean_html(tostring(fragment))
    return cleaned.decode()
def call_it(self, *args, **kwargs):
    """Forward all arguments to lxml's html5 ``fragment_fromstring``."""
    from lxml.html.html5parser import fragment_fromstring
    parse = fragment_fromstring
    return parse(*args, **kwargs)
def sanitize_html(src):
    """Drop trailing ``<br>`` elements from an HTML fragment.

    Removes every ``<br>`` that has no following sibling nodes (i.e. is
    the last node of its parent), then re-serializes the fragment.
    """
    xhtml = {'h': 'http://www.w3.org/1999/xhtml'}
    tree = html5.fragment_fromstring(src, create_parent='div')
    trailing = tree.xpath('//h:br[count(following-sibling::node()) = 0]',
                          namespaces=xhtml)
    for node in trailing:
        node.getparent().remove(node)
    return serialize_fragment(tree)