Example #1
File: util.py Project: dpk/ikwi
def link_fix(src, fix):
    # Rewrite every 'wiki:' link by stripping the prefix, percent-quoting the
    # page name, and passing it through the caller-supplied fix() function.
    h = html5.fragment_fromstring(src, create_parent='div')
    links = h.xpath('//h:a[@href]', namespaces={'h':'http://www.w3.org/1999/xhtml'})
    for link in links:
        if link.attrib['href'].startswith('wiki:'):
            link.attrib['href'] = fix(urlparse.quote(link.attrib['href'][5:]))
    return serialize_fragment(h)
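A detail that recurs throughout these examples: lxml.html.html5parser places parsed elements in the XHTML namespace, which is why the XPath queries bind an h: prefix instead of using bare tag names. A minimal standalone sketch of the difference (not from any of the projects above):

from lxml.html import html5parser

frag = html5parser.fragment_fromstring('<a href="wiki:Home">Home</a>',
                                       create_parent='div')
# Parsed elements carry the XHTML namespace, so an unprefixed XPath matches nothing:
assert frag.xpath('//a') == []
# Binding a prefix to the XHTML namespace makes the same query work:
links = frag.xpath('//h:a[@href]',
                   namespaces={'h': 'http://www.w3.org/1999/xhtml'})
assert links[0].get('href') == 'wiki:Home'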
Example #2
    def next_hours_days(self,
                        location: AnyLocation) -> Optional[NextHoursDaysData]:
        city_id = self.location_cache.get_station_id(location)
        if city_id is None:
            return

        url = f'{self.AJAX_URL}/weathernexthoursdays?city_id={city_id}&lang=de&units=de&tf=1'
        with self.get_session() as session:
            with session.get(url) as resp:
                status_code = resp.status_code
                if resp.status_code not in range(200, 300):
                    logger.warning(
                        'Could not retrieve data for url=%r, status_code=%r',
                        url, status_code)
                    return

                content = resp.text

        root = fragment_fromstring(content, create_parent='root')

        data_hours = xpath(
            root,
            f'/root/xhtml:div[{has_class("nexthours-scroll")}]/xhtml:div/xhtml:div',
        )
        data_hours = [parse_hour(hour_data) for hour_data in data_hours]

        data_days = xpath(
            root,
            f'/root/xhtml:div[{has_class("day-row")}]/xhtml:div/xhtml:div')
        data_days = [parse_day(day_data) for day_data in data_days]

        return NextHoursDaysData(data_hours, data_days)
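Example #2 relies on a has_class() helper that is not shown. A plausible definition (hypothetical; the project's real helper may differ) uses the standard XPath idiom for matching one class inside the space-separated @class attribute:

def has_class(name):
    # @class is a space-separated list, so pad both it and the wanted name
    # with spaces before testing for containment.
    return ("contains(concat(' ', normalize-space(@class), ' '), "
            "' %s ')" % name)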
Example #3
 def do_update(self, differences):
     self.index = self.index.refresh()
     with self.index.writer() as writer:
         for page, difference in differences.items():
             op, content = difference
             
             if content:
                 content = content.decode('utf-8')
                 if content.startswith('=> '):
                     # A source beginning with '=> ' marks a redirect page:
                     # index it with an empty body and the redirect target.
                     redirect_to = content[3:].strip()
                     doc = {
                         'filename': page,
                         'url': filename_to_url(page),
                         'title': filename_to_title(page),
                         'content': '',
                         'redirect_to': redirect_to
                     }
                 else:
                     src = html5.fragment_fromstring(self.site.to_html(content), create_parent='div')
                     content = ' '.join(src.xpath('//text()'))
                     doc = {
                         'filename': page,
                         'url': filename_to_url(page),
                         'title': filename_to_title(page),
                         'content': content,
                         'redirect_to': None
                     }
             
             if op == 'created':
                 writer.add_document(**doc)
             elif op == 'updated':
                 writer.update_document(**doc)
             elif op == 'deleted':
                 writer.delete_by_term('filename', page)
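The writer calls in Example #3 (add_document, update_document, delete_by_term) match Whoosh's index API. A schema along these lines would support the documents being written; this is a guess at the surrounding setup, not the project's actual code:

from whoosh import fields, index

schema = fields.Schema(
    filename=fields.ID(stored=True, unique=True),  # delete_by_term key
    url=fields.ID(stored=True),
    title=fields.TEXT(stored=True),
    content=fields.TEXT,
    redirect_to=fields.ID(stored=True),
)
ix = index.create_in('indexdir', schema)  # directory name is illustrative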
Example #4
def find_iter(skeleton, document):
    """
    Return an iterator that yields elements from the document that
    match given skeleton.

    See `find_all` for details.
    """
    if is_string(document):
        document = html5parser.document_fromstring(document)
    if is_string(skeleton):
        skeleton = html5parser.fragment_fromstring(skeleton)

    for element in document.iterdescendants():
        if node_matches_bone(element, skeleton):
            yield element
Example #5
 def index_content(page, content):
     content = content.decode('utf-8')
     if content.startswith('=> '):
         # Redirect pages ('=> target') go into the redirects table and are
         # not indexed as content.
         c.execute('INSERT INTO redirects (source, target) VALUES (?, ?)', (page, content[3:].strip()))
         return
     
     src = html5.fragment_fromstring(self.site.to_html(content), create_parent='div')
     links = src.xpath('//h:a[@href]', namespaces=ns)
     page_links = set()
     
     for link in links:
         dest = link.attrib['href']
         destinfo = urlparse(dest)
         if dest.startswith('wiki:'):
             dest_name = url_to_filename(dest[5:])
             page_links.add((page, dest_name))
         elif dest.rstrip('/') == self.site.base_url.rstrip('/'):
             page_links.add((page, 'Homepage'))
     
     c.executemany('INSERT INTO links (source, target) VALUES (?, ?)', list(page_links))
Example #6
def convert_link_to_latex(instr):
    dom = html5parser.fragment_fromstring(instr)
    return u'\\href{%s}{%s}' % (dom.get('href'), dom.text)
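Without create_parent, fragment_fromstring requires the input to contain exactly one top-level element and returns that element directly, which is why Example #6 (and Example #7 below) can call dom.get() and dom.text on the result. A standalone illustration:

from lxml.html import html5parser

dom = html5parser.fragment_fromstring(u'<a href="https://example.org/">example</a>')
# Exactly one top-level element is required; it is returned as-is.
print(dom.get('href'))  # https://example.org/
print(dom.text)         # example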
Example #7
def convert_link_to_latex(instr):
    dom = html5parser.fragment_fromstring(instr)
    return u'\\href{%s}{%s}' % (latex_escape(dom.get('href'), ignore_math=True), dom.text)
Example #8
 def call_it(self, *args, **kwargs):
     from lxml.html.html5parser import fragment_fromstring
     return fragment_fromstring(*args, **kwargs)
Example #9
def html_diff(a, b, insertion=default_ins, deletion=default_del):
  # Parse both fragments, flatten each tree to its leaf nodes, diff the two
  # leaf sequences, and render the result with insertion/deletion markup.
  aleaves, bleaves = (leaves(fragment_fromstring(x, create_parent='div')) for x in (a, b))
  dleaves = diffleaves(aleaves, bleaves)
  
  return leaves2html(dleaves, insertion, deletion)
Example #10
def sanitize_html(html):
    html = html5parser.fragment_fromstring(html, create_parent="div")
    html = cleaner.clean_html(tostring(html)).decode()
    return html
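sanitize_html assumes a module-level cleaner plus a tostring import. One way to provide both with stock lxml (the settings are illustrative; the original project's Cleaner configuration is unknown):

from lxml.etree import tostring
from lxml.html.clean import Cleaner

# Illustrative settings: strip <script> elements, inline JavaScript,
# and <style> blocks. clean_html() accepts the bytes that tostring() returns.
cleaner = Cleaner(scripts=True, javascript=True, style=True)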
Example #13
File: util.py Project: dpk/ikwi
def sanitize_html(src):
    h = html5.fragment_fromstring(src, create_parent='div')
    # Remove <br> elements that are the last node in their parent, i.e.
    # trailing line breaks.
    brs = h.xpath('//h:br[count(following-sibling::node()) = 0]', namespaces={'h':'http://www.w3.org/1999/xhtml'})
    for br in brs:
        br.getparent().remove(br)
    return serialize_fragment(h)
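Examples #1 and #13 both finish with serialize_fragment(h), a helper from the same project that is not shown here. A hypothetical stand-in that serializes the wrapper <div>'s contents back to markup (note that html5parser elements carry the XHTML namespace, which lxml will include in the output):

from lxml import etree

def serialize_fragment(parent):
    # Emit the wrapper's leading text followed by each child element,
    # leaving out the wrapper tag itself.
    parts = [parent.text or '']
    parts.extend(etree.tostring(child, encoding='unicode') for child in parent)
    return ''.join(parts)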