Example #1
    def _parse_multiple_apps(self, list_response):
        """Extracts app ids from a list's Response object, sends GET requests to
        each app, parses detailed info and returns all apps in a list.

        :param list_response: the Response object from a list request
        :return: a list of app dictionaries
        """
        soup = html5_parser.parse(list_response.text, treebuilder='soup')

        app_ids = [
            x.attrs['data-docid']
            for x in soup.find_all('span', class_='preview-overlay-container')
        ]
        responses = multi_app_request(app_ids)

        apps = []
        errors = []
        for i, r in enumerate(responses):
            if r is not None and r.status_code == requests.codes.ok:
                # soup = BeautifulSoup(r.text, 'lxml', parse_only=app_strainer)
                soup = html5_parser.parse(r.text, treebuilder='soup')

                apps.append(self._parse_app_details(soup))
            else:
                errors.append(app_ids[i])

        if errors:
            self._log.error(
                "There was an error parsing the following apps: {errors}.".
                format(errors=", ".join(errors)))

        return apps
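
Both parse calls in this example use the soup treebuilder, which returns a BeautifulSoup-compatible tree (this requires the beautifulsoup4 package). A minimal standalone sketch of the id-extraction step, with an invented markup fragment:

import html5_parser

markup = '<div><span class="preview-overlay-container" data-docid="com.example.app">x</span></div>'
soup = html5_parser.parse(markup, treebuilder='soup')
# The returned root supports the usual BeautifulSoup API.
app_ids = [tag.attrs['data-docid']
           for tag in soup.find_all('span', class_='preview-overlay-container')]
print(app_ids)  # ['com.example.app']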
Example #2
def content_parser(data,
                   url=None,
                   response_info=None,
                   transport_encoding=None,
                   default_encoding=DEFAULT_ENCODING,
                   is_html=True):
    '''
    Parse data (a bytes object) into an etree representation such as
    :py:mod:`xml.etree.ElementTree` or `lxml.etree`

    :param bytes data: The data to parse
    :param url: The URL of the document being parsed or None
    :param response_info: Information about the document
        (contains all HTTP headers as :class:`mimetools.Message`)
    :param transport_encoding: The character encoding for the document being
        parsed as specified in the HTTP headers or None.
    :param default_encoding: The character encoding to use if no encoding
        could be detected and no transport_encoding is specified
    :param is_html: If the document is to be parsed as HTML.
    '''
    if not is_html:
        return
    try:
        from html5_parser import parse
    except ImportError:
        from html5lib import parse
        return parse(data,
                     transport_encoding=transport_encoding,
                     namespaceHTMLElements=False)
    else:
        return parse(data, transport_encoding=transport_encoding)
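
A hedged usage sketch for content_parser; DEFAULT_ENCODING is assumed to be a module-level constant such as 'utf-8', and the sample bytes are invented:

data = b'<html><body><p>hello</p></body></html>'
root = content_parser(data, transport_encoding='utf-8')
if root is not None:
    # Both backends return an etree-style root element.
    print(root.find('.//p').text)  # hello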
Example #3
    def html(self):
        raw = original_html = self.toHtml()
        check = self.toPlainText().strip()
        raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
        raw = self.comments_pat.sub('', raw)
        if not check and '<img' not in raw.lower():
            return ''

        try:
            root = parse(raw, maybe_xhtml=False, sanitize_names=True)
        except Exception:
            root = parse(clean_xml_chars(raw), maybe_xhtml=False, sanitize_names=True)
        if root.xpath('//meta[@name="calibre-dont-sanitize"]'):
            # Bypass cleanup if special meta tag exists
            return original_html

        try:
            cleanup_qt_markup(root)
        except Exception:
            import traceback
            traceback.print_exc()
        elems = []
        for body in root.xpath('//body'):
            if body.text:
                elems.append(body.text)
            elems += [html.tostring(x, encoding='unicode') for x in body if
                x.tag not in ('script', 'style')]

        if len(elems) > 1:
            ans = '<div>%s</div>'%(''.join(elems))
        else:
            ans = ''.join(elems)
            if not ans.startswith('<'):
                ans = '<p>%s</p>'%ans
        return xml_replace_entities(ans)
Example #4
def content_parser(data,
                   url=None,
                   response_info=None,
                   transport_encoding=None,
                   default_encoding=DEFAULT_ENCODING,
                   is_html=True):
    '''
    Parse data (a bytes object) into an etree representation such as
    :py:mod:`xml.etree.ElementTree` or `lxml.etree`

    :param bytes data: The data to parse
    :param url: The URL of the document being parsed or None
    :param response_info: Information about the document
        (contains all HTTP headers as :class:`HTTPMessage`)
    :param transport_encoding: The character encoding for the document being
        parsed as specified in the HTTP headers or None.
    :param default_encoding: The character encoding to use if no encoding
        could be detected and no transport_encoding is specified
    :param is_html: If the document is to be parsed as HTML.
    '''
    if not is_html:
        return
    try:
        from html5_parser import parse
    except Exception:
        from html5lib import parse
        kw = {'namespaceHTMLElements': False}
        if transport_encoding and isinstance(data, bytes):
            kw['transport_encoding'] = transport_encoding
        return parse(data, **kw)
    else:
        return parse(data, transport_encoding=transport_encoding)
Example #5
    def _parse_multiple_apps(self, list_response):
        """Extracts app ids from a list's Response object, sends GET requests to
        each app, parses detailed info and returns all apps in a list.

        :param list_response: the Response object from a list request
        :return: a list of app dictionaries
        """
        soup = html5_parser.parse(list_response.text, treebuilder='soup')

        app_ids = [x.attrs['data-docid'] for x in soup.find_all('span', class_='preview-overlay-container')]
        responses = multi_app_request(app_ids)

        apps = []
        errors = []
        for i, r in enumerate(responses):
            if r is not None and r.status_code == requests.codes.ok:
                # soup = BeautifulSoup(r.text, 'lxml', parse_only=app_strainer)
                soup = html5_parser.parse(r.text, treebuilder='soup')

                apps.append(self._parse_app_details(soup))
            else:
                errors.append(app_ids[i])

        if errors:
            self._log.error("There was an error parsing the following apps: {errors}.".format(
                errors=", ".join(errors)))

        return apps
Example #6
 def test_doctype(self):
     base = '\n<html><body><p>xxx</p></body></html>'
     for dt in ('html', 'html PUBLIC "-//W3C//DTD HTML 4.01//EN" '
                '"http://www.w3.org/TR/html4/strict.dtd"'):
         dt = '<!DOCTYPE {}>'.format(dt)
         t = parse(dt + base).getroottree()
         self.ae(dt, t.docinfo.doctype)
         t = parse(dt + base, keep_doctype=False).getroottree()
         self.assertFalse(t.docinfo.doctype)
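
The doctype handling exercised here works the same outside the test harness; parse keeps the doctype by default and getroottree() exposes it via docinfo:

from html5_parser import parse

tree = parse('<!DOCTYPE html><html><body><p>x</p></body></html>').getroottree()
print(tree.docinfo.doctype)  # <!DOCTYPE html>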
Example #7
def search(query, max_results=10, timeout=60, write_raw_to=None):
    url = 'https://www.gutenberg.org/ebooks/search/?query={}&submit_search=Search'.format(
        quote_plus(query))

    counter = max_results
    br = browser()
    raw = br.open(url).read()

    if write_raw_to is not None:
        with open(write_raw_to, 'wb') as f:
            f.write(raw)

    root = parse(raw)
    CSSSelect = Select(root)
    for li in CSSSelect('li.booklink'):
        if counter <= 0:
            break
        counter -= 1

        s = SearchResult()
        a = next(CSSSelect('a.link', li))
        s.detail_item = absurl(a.get('href'))
        s.title = etree.tostring(next(CSSSelect('span.title', li)),
                                 method='text',
                                 encoding='unicode').strip()
        try:
            s.author = etree.tostring(next(CSSSelect('span.subtitle', li)),
                                      method='text',
                                      encoding='unicode').strip()
        except StopIteration:
            s.author = ""
        for img in CSSSelect('img.cover-thumb', li):
            s.cover_url = absurl(img.get('src'))
            break

        # Get the formats and direct download links.
        details_doc = parse(br.open_novisit(s.detail_item).read())
        doc_select = Select(details_doc)
        for tr in doc_select('table.files tr[typeof="pgterms:file"]'):
            for a in doc_select('a.link', tr):
                href = a.get('href')
                type = a.get('type')
                ext = mimetypes.guess_extension(
                    type.split(';')[0]) if type else None
                if href and ext:
                    url = absurl(href.split('?')[0])
                    ext = ext[1:].upper().strip()
                    if ext not in s.downloads:
                        s.downloads[ext] = url
                    break

        s.formats = ', '.join(s.downloads.keys())
        if not s.formats:
            continue

        yield s
Example #8
 def test_xmlns(self):
     root = parse('<html><p xmlns:foo="f">xxx<f:moo/>')
     self.ae(tostring(root), '<html><head/><body><p xmlns_foo="f">xxx<f_moo/></p></body></html>')
     root = parse('<p xmlns="x"><p xmlns:="y"><svg xmlns:xlink="xxx">')
     self.ae(
         tostring(root), '<html xmlns:xlink="http://www.w3.org/1999/xlink"><head/>'
         '<body><p/><p><svg/></p></body></html>')
     root = parse("""<p a:a="1" xmlns:a="a">""", maybe_xhtml=True)
     p = root[1][0]
     self.ae(p.attrib, {'{a}a': '1'})
Example #9
 def test_maybe_xhtml(self):
     for tag in 'title script style'.split():
         html = '<html><head><{}/></head><body><p>xxx</p></body></html>'.format(
             tag)
         root = parse(html)
         root = parse(html, maybe_xhtml=True)
         self.ae(len(root[1]), 1)
         html = '<html><head></head><body><{}/><p>xxx</p></body></html>'.format(
             tag)
         root = parse(html, maybe_xhtml=True)
         self.ae(len(root[1]), 2)
Example #10
 def test_maybe_xhtml(self):
     for tag in 'title script style'.split():
         html = '<html><head><{}/></head><body><p>xxx</p></body></html>'.format(tag)
         root = parse(html)
         root = parse(html, maybe_xhtml=True)
         self.ae(len(root[1]), 1)
         html = '<html><head></head><body><{}/><p>xxx</p></body></html>'.format(tag)
         root = parse(html, maybe_xhtml=True)
         self.ae(len(root[1]), 2)
     root = parse('<title/><title>t</title></title></title><link href="h">', maybe_xhtml=True)
     self.ae(
         tostring(root),
         '<html xmlns="http://www.w3.org/1999/xhtml"><head><title/>'
         '<title>t</title><link href="h"/></head><body/></html>')
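
The final assertion can be reproduced standalone; with maybe_xhtml=True the tree is built in the XHTML namespace and the stray end tags are dropped:

from html5_parser import parse
from lxml.etree import tostring

root = parse('<title/><title>t</title></title></title><link href="h">', maybe_xhtml=True)
print(tostring(root, encoding='unicode'))
# <html xmlns="http://www.w3.org/1999/xhtml"><head><title/><title>t</title><link href="h"/></head><body/></html>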
Example #11
    def developer(self, developer, results=None, page=None, detailed=False):
        """Sends a POST request and retrieves a list of the developer's published
        applications on the Play Store.

        :param developer: developer name to retrieve apps from, e.g. 'Disney'
        :param results: the number of app results to retrieve
        :param page: the page number to retrieve
        :param detailed: if True, sends request per app for its full detail
        :return: a list of app dictionaries
        """
        results = s.DEV_RESULTS if results is None else results
        page = 0 if page is None else page
        page_num = (results // 20) * page
        if not 0 <= page_num <= 12:
            raise ValueError(
                'Page out of range. (results // 20) * page must be between 0 - 12'
            )
        pagtok = self._pagtok[page_num]

        url = build_url('developer', developer)
        data = generate_post_data(results, 0, pagtok)
        response = send_request('POST', url, data)
        soup = html5_parser.parse(response.text, treebuilder='soup')

        if detailed:
            apps = self._parse_multiple_apps(response)
        else:
            apps = [
                self._parse_card_info(app)
                for app in soup.select('div[data-uitype=500]')
            ]

        return apps
Example #12
    def implementation(self, fragment_context, html, expected, errors,
                       test_name):
        if fragment_context:
            fragment_context = fragment_context.replace(' ', ':')
        bad = self.check_test(fragment_context, html, expected, errors,
                              test_name)
        if bad is not None:
            raise unittest.SkipTest(bad)

        root = parse(html,
                     namespace_elements=True,
                     sanitize_names=False,
                     fragment_context=fragment_context)
        output = serialize_construction_output(
            root, fragment_context=fragment_context)

        # html5lib doesn't yet support the template tag, but it appears in the
        # tests with the expectation that the template contents will be under the
        # word 'contents', so we need to reformat that string a bit.
        # expected = reformatTemplateContents(expected)

        error_msg = '\n'.join([
            '\n\nInput:', html, '\nExpected:', expected, '\nReceived:', output
        ])
        self.ae(expected, output, error_msg + '\n')
Example #13
def parse_html5(raw,
                decoder=None,
                log=None,
                discard_namespaces=False,
                line_numbers=True,
                linenumber_attribute=None,
                replace_entities=True,
                fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = entities.xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = clean_xml_chars(raw)
    root = html5_parser.parse(raw,
                              maybe_xhtml=not discard_namespaces,
                              line_number_attr=linenumber_attribute,
                              keep_doctype=False,
                              sanitize_names=True)
    if ((discard_namespaces and root.tag != 'html') or
        (not discard_namespaces and
         (root.tag != '{%s}%s' % (const.XHTML_NS, 'html') or root.prefix))):
        raise ValueError('Failed to parse correctly, root has tag: %s and '
                         'prefix: %s' % (root.tag, root.prefix))
    return root
Example #14
    def developer(self, developer, results=None, page=None, detailed=False):
        """Sends a POST request and retrieves a list of the developer's published
        applications on the Play Store.

        :param developer: developer name to retrieve apps from, e.g. 'Disney'
        :param results: the number of app results to retrieve
        :param page: the page number to retrieve
        :param detailed: if True, sends request per app for its full detail
        :return: a list of app dictionaries
        """
        results = s.DEV_RESULTS if results is None else results
        page = 0 if page is None else page
        page_num = (results // 20) * page
        if not 0 <= page_num <= 12:
            raise ValueError('Page out of range. (results // 20) * page must be between 0 - 12')
        pagtok = self._pagtok[page_num]

        url = build_url('developer', developer)
        data = generate_post_data(results, 0, pagtok)
        response = send_request('POST', url, data)
        soup = html5_parser.parse(response.text, treebuilder='soup')

        if detailed:
            apps = self._parse_multiple_apps(response)
        else:
            apps = [self._parse_card_info(app) for app in soup.select('div[data-uitype=500]')]

        return apps
Example #15
def generate_public_uti_map():
    from lxml import etree
    from polyglot.urllib import urlopen
    from html5_parser import parse
    raw = urlopen(
        'https://developer.apple.com/library/ios/documentation/Miscellaneous/Reference/UTIRef/Articles/System-DeclaredUniformTypeIdentifiers.html').read()
    root = parse(raw)
    tables = root.xpath('//table')[0::2]
    data = {}
    for table in tables:
        for tr in table.xpath('descendant::tr')[1:]:
            td = tr.xpath('descendant::td')
            identifier = etree.tostring(td[0], method='text', encoding=unicode_type).strip()
            tags = etree.tostring(td[2], method='text', encoding=unicode_type).strip()
            identifier = identifier.split()[0].replace('\u200b', '')
            exts = [x.strip()[1:].lower() for x in tags.split(',') if x.strip().startswith('.')]
            for ext in exts:
                data[ext] = identifier
    lines = ['PUBLIC_UTI_MAP = {']
    for ext in sorted(data):
        r = ("'" + ext + "':").ljust(16)
        lines.append((' ' * 4) + r + "'" + data[ext] + "',")
    lines.append('}')
    with open(__file__, 'r+b') as f:
        raw = f.read()
        f.seek(0)
        nraw = re.sub(r'^PUBLIC_UTI_MAP = .+?}', '\n'.join(lines), raw, flags=re.MULTILINE | re.DOTALL)
        f.truncate(), f.write(nraw)
Example #16
def html5_parse(raw,
                decoder=None,
                log=None,
                discard_namespaces=False,
                line_numbers=True,
                linenumber_attribute=None,
                replace_entities=True,
                fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = replace_chars.sub('', raw)
    from html5_parser import parse
    root = parse(raw,
                 maybe_xhtml=not discard_namespaces,
                 line_number_attr=linenumber_attribute,
                 keep_doctype=False)
    if (discard_namespaces and root.tag != 'html') or (
            not discard_namespaces and
        (root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix)):
        raise ValueError(
            'Failed to parse correctly, root has tag: %s and prefix: %s' %
            (root.tag, root.prefix))
    return root
Example #17
 def test_dom(self):
     root = parse(HTML, treebuilder='dom', namespace_elements=True)
     doc = root.ownerDocument
     self.ae(doc.doctype, DOCTYPE)
     self.ae(root.tagName, 'html')
     self.ae(
         dict(root.attributes.itemsNS()),
         dict([((u'xmlns', u'xmlns'), 'http://www.w3.org/1999/xhtml'),
               ((u'xmlns', u'xlink'), 'http://www.w3.org/1999/xlink'),
               ((None, u'xml_lang'), 'en'), ((None, u'lang'), 'en')]))
     script = doc.getElementsByTagName('script')[0]
     self.ae(script.firstChild.nodeValue, 'a < & " b')
     p = doc.getElementsByTagName('p')[0]
     self.ae(p.toxml(), '<p>A <span>test</span> of text and tail\n</p>')
     p = doc.getElementsByTagName('p')[-1]
     self.ae(dict(p.attributes.itemsNS()),
             dict([((None, u'xml_lang'), 'de')]))
     svg = doc.getElementsByTagName('svg')[0]
     self.ae(
         dict(svg.attributes.itemsNS()), {
             (None, 'viewBox'): 'v',
             (u'xmlns', u'xmlns'): 'http://www.w3.org/2000/svg'
         })
     self.ae(dict(svg.firstChild.attributes.itemsNS()),
             dict([((XLINK, u'href'), 'h')]))
     self.ae(root.lastChild.nodeValue, COMMENT.replace('--', '\u2014'))
Example #18
def insert_style(html, css):
    """
    Insert a new <style> tag with CSS.

    Parameters
    ----------
    html : string
    css : string

    Returns
    -------
    render : string
    """
    soup = html5_parser.parse(html, treebuilder='soup', return_root=False)

    # Ensure html includes a <head></head>.
    if not soup.head:
        head = soup.new_tag('head')
        soup.html.insert(0, head)

    style_tag = soup.new_tag("style", type="text/css")
    style_tag.string = css
    soup.head.append(style_tag)
    render = soup.prettify(formatter=None)
    return render
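
A possible call to insert_style, assuming html5_parser is imported at module level as in the other examples:

page = '<html><body><p>Hi</p></body></html>'
print(insert_style(page, 'p { color: red; }'))
# The prettified output carries <style type="text/css">p { color: red; }</style> inside <head>.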
Example #19
    def search(self, query, page=None, detailed=False):
        """Sends a POST request and retrieves a list of applications matching
        the query term(s).

        :param query: search query term(s) to retrieve matching apps
        :param page: the page number to retrieve. Max is 12.
        :param detailed: if True, sends request per app for its full detail
        :return: a list of apps matching search terms
        """
        page = 0 if page is None else page
        if page > len(self._pagtok) - 1:
            raise ValueError('Page out of range. Please choose a number between 0 - 12')

        pagtok = self._pagtok[page]
        data = generate_post_data(0, 0, pagtok)

        params = {
            'q': quote_plus(query),
            'c': 'apps'
        }

        response = send_request('POST', self._search_url, data, params)
        soup = html5_parser.parse(response.text, treebuilder='soup')

        if detailed:
            apps = self._parse_multiple_apps(response)
        else:
            apps = [self._parse_card_info(app) for app in soup.select('div[data-uitype=500]')]

        return apps
Example #20
        def test_matching(self):
            root = parse(namespace_elements=True,
                         html='''
<html id='root'>
<head id='head'></head>
<body id='body'>
<p class="one red" id='p1'>simple
<p class="two green" id='p2'>a'b"c
''')
            all_ids = root.xpath('//*/@id')

            def q(mt, query=''):
                r = Rule({'match_type': mt, 'query': query, 'actions': []})
                ans = []
                for tag in r.selector(root):
                    ans.append(tag.get('id'))
                return ans

            def t(mt, query='', expected=[]):
                self.ae(expected, q(mt, query))

            t('*', expected=all_ids)
            t('is', 'body', ['body'])
            t('is', 'p', ['p1', 'p2'])
            t('has_class', 'one', ['p1'])
            ei = list(all_ids)
            ei.remove('p1')
            t('not_has_class', 'one', ei)
            t('css', '#body > p.red', ['p1'])
            t('xpath', '//h:body', ['body'])
            t('contains_text', 'imple', ['p1'])
            t('contains_text', 'a\'b"c', ['p2'])
Example #21
def get_images_from_polyglot_text(txt: str,
                                  base_dir: str = '',
                                  file_ext: str = 'txt') -> set:
    from calibre.ebooks.oeb.base import OEB_IMAGES
    from calibre import guess_type
    if not base_dir:
        base_dir = os.getcwd()
    images = set()

    def check_path(path: str) -> None:
        if path and not os.path.isabs(path) and guess_type(
                path)[0] in OEB_IMAGES and os.path.exists(
                    os.path.join(base_dir, path)):
            images.add(path)

    if file_ext in ('txt', 'text', 'textile'):
        # Textile
        for m in re.finditer(
                r'(?mu)(?:[\[{])?\!(?:\. )?(?P<path>[^\s(!]+)\s?(?:\(([^\)]+)\))?\!(?::(\S+))?(?:[\]}]|(?=\s|$))',
                txt):
            path = m.group('path')
            check_path(path)

    if file_ext in ('txt', 'text', 'md', 'markdown'):
        # Markdown
        from markdown import Markdown
        html = HTML_TEMPLATE % ('', Markdown().convert(txt))
        from html5_parser import parse
        root = parse(html)
        for img in root.iterdescendants('img'):
            path = img.get('src')
            check_path(path)
    return images
Example #22
 def format_template(template, page_num, height):
     template = template.replace('_TOP_LEVEL_SECTION_PAGES_',
                                 str(toplevel_pagenum_map[page_num - 1]))
     template = template.replace('_TOP_LEVEL_SECTION_PAGENUM_',
                                 str(toplevel_pages_map[page_num - 1]))
     template = template.replace('_TOTAL_PAGES_', str(pages_in_doc))
     template = template.replace('_PAGENUM_',
                                 str(page_number_display_map[page_num]))
     template = template.replace(
         '_TITLE_', prepare_string_for_xml(pdf_metadata.title, True))
     template = template.replace(
         '_AUTHOR_', prepare_string_for_xml(pdf_metadata.author, True))
     template = template.replace(
         '_TOP_LEVEL_SECTION_',
         prepare_string_for_xml(toplevel_toc_map[page_num - 1]))
     template = template.replace(
         '_SECTION_', prepare_string_for_xml(page_toc_map[page_num - 1]))
     troot = parse(template, namespace_elements=True)
     ans = last_tag(troot)[0]
     style = ans.get('style') or ''
     style = (
         'margin: 0; padding: 0; height: {height}pt; border-width: 0;'
         'display: flex; align-items: center; overflow: hidden; background-color: unset;'
     ).format(height=height) + style
     ans.set('style', style)
     for child in ans.xpath('descendant-or-self::*[@class]'):
         cls = frozenset(child.get('class').split())
         q = 'even-page' if page_num % 2 else 'odd-page'
         if q in cls or q.replace('-', '_') in cls:
             style = child.get('style') or ''
             child.set('style', style + '; display: none')
     return ans
Example #23
def html_to_lxml(raw):
    raw = '<div>%s</div>' % raw
    root = parse(raw,
                 keep_doctype=False,
                 namespace_elements=False,
                 maybe_xhtml=False,
                 sanitize_names=True)
    root = next(root.iterdescendants('div'))
    root.set('xmlns', "http://www.w3.org/1999/xhtml")
    raw = etree.tostring(root, encoding='unicode')
    try:
        return safe_xml_fromstring(raw, recover=False)
    except:
        for x in root.iterdescendants():
            remove = []
            for attr in x.attrib:
                if ':' in attr:
                    remove.append(attr)
            for a in remove:
                del x.attrib[a]
        raw = etree.tostring(root, encoding='unicode')
        try:
            return safe_xml_fromstring(raw, recover=False)
        except:
            from calibre.ebooks.oeb.parse_utils import _html4_parse
            return _html4_parse(raw)
Example #24
    def search(self, query, page=None, detailed=False):
        """Sends a POST request and retrieves a list of applications matching
        the query term(s).

        :param query: search query term(s) to retrieve matching apps
        :param page: the page number to retrieve. Max is 12.
        :param detailed: if True, sends request per app for its full detail
        :return: a list of apps matching search terms
        """
        page = 0 if page is None else page
        if page > len(self._pagtok) - 1:
            raise ValueError(
                'Page out of range. Please choose a number between 0 - 12')

        pagtok = self._pagtok[page]
        data = generate_post_data(0, 0, pagtok)

        params = {'q': quote_plus(query), 'c': 'apps'}

        response = send_request('POST', self._search_url, data, params)
        soup = html5_parser.parse(response.text, treebuilder='soup')

        if detailed:
            apps = self._parse_multiple_apps(response)
        else:
            apps = [
                self._parse_card_info(app)
                for app in soup.select('div[data-uitype=500]')
            ]

        return apps
Example #25
    def parse_html(self, file, encoding=None, parser_type='html'):
        '''Return an iterator of elements found in the document.

        Args:
            file: A file object containing the document.
            encoding (str): The encoding of the document.
            parser_type (str): The type of parser to use. Accepted values:
                ``html``, ``xhtml``, ``xml``.

        Returns:
            iterator: Each item is an element from
            :mod:`.document.htmlparse.element`
        '''
        if parser_type == 'html' or parser_type == 'xhtml':
            content = file.read()
            # Don't use maybe_xhtml=True for xhtml because it segfaults:
            # https://github.com/ludios/wpull/issues/15
            tree = html5_parser.parse(content, transport_encoding=encoding)
        else:
            if encoding:
                lxml_encoding = to_lxml_encoding(encoding) or 'latin1'
            else:
                lxml_encoding = encoding

            parser = lxml.etree.XMLParser(encoding=lxml_encoding, recover=True)
            tree = lxml.etree.parse(file, parser=parser)

        for element in tree.getiterator():
            yield element
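
The HTML branch above amounts to handing raw bytes to html5_parser along with the transport encoding from the HTTP layer; a self-contained approximation of that path:

import io

import html5_parser

fileobj = io.BytesIO(b'<html><body><a href="http://example.com/">x</a></body></html>')
tree = html5_parser.parse(fileobj.read(), transport_encoding='utf-8')
# Iterate elements the way parse_html's caller would.
print([a.get('href') for a in tree.iter('a')])  # ['http://example.com/']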
Example #26
    def similar(self, app_id, results=None, detailed=False):
        """Sends a POST request and retrieves a list of applications similar to
        the specified app.

        :param app_id: the app to retrieve details from, e.g. 'com.nintendo.zaaa'
        :param results: the number of apps to retrieve at a time.
        :param detailed: if True, sends request per app for its full detail
        :return: a list of similar apps
        """
        results = s.SIMILAR_RESULTS if results is None else results

        url = build_url('similar', app_id)
        data = generate_post_data(results)
        response = send_request('POST', url, data)
        soup = html5_parser.parse(response.text, treebuilder='soup')

        if detailed:
            apps = self._parse_multiple_apps(response)
        else:
            apps = [
                self._parse_card_info(app)
                for app in soup.select('div[data-uitype=500]')
            ]

        return apps
Example #27
    def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
        from html5_parser import parse
        from lxml import html
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.library.comments import sanitize_comments_html

        try:
            raw = br.open_novisit(metadata_url).read()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                return False
            raise
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]

        try:
            root = parse(raw, maybe_xhtml=False, sanitize_names=True)
        except Exception:
            return False

        pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
        lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
        subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
        ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
        desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")

        if pub_date:
            from calibre.utils.date import parse_date
            try:
                mi.pubdate = parse_date(pub_date[0].strip())
            except:
                pass
        if lang:
            lang = lang[0].strip().lower()
            lang = {'english':'eng', 'french':'fra', 'german':'deu',
                    'spanish':'spa'}.get(lang, None)
            if lang:
                mi.language = lang

        if ebook_isbn:
            # print "ebook isbn is "+str(ebook_isbn[0])
            isbn = check_isbn(ebook_isbn[0].strip())
            if isbn:
                self.cache_isbn_to_identifier(isbn, ovrdrv_id)
                mi.isbn = isbn
        if subjects:
            mi.tags = [tag.strip() for tag in subjects[0].split(',')]

        if desc:
            desc = desc[0]
            desc = html.tostring(desc, method='html', encoding='unicode').strip()
            # remove all attributes from tags
            desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
            # Remove comments
            desc = re.sub(r'(?s)<!--.*?-->', '', desc)
            mi.comments = sanitize_comments_html(desc)

        return None
Example #28
def _get_text(html):
    "Extract textual content from HTML."
    soup = html5_parser.parse(html, treebuilder='soup', return_root=False)
    # Drop comment nodes before collecting the remaining text nodes.
    for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
        element.extract()
    return soup.find_all(text=True)
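
Comment here is bs4.Comment; a quick check of the helper with invented markup:

from bs4 import Comment  # assumed import in the original module

print(_get_text('<p>keep<!-- drop --></p>'))  # e.g. ['keep']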
Example #29
 def create_doc(cls, data, **kwargs):
     if isinstance(data, str):
         doc = parse(data)
     elif isinstance(data, (dict, list)):
         doc = object_to_dom("root", data)
     elif isinstance(data, parsel.Selector):
         # A parsel.Selector is already queryable; pass it through unchanged.
         doc = data
     else:
         raise DocumentTypeNotSupportedForQuerier()
     return doc
Example #30
    def test_xlink(self):
        src = '''<html xmlns:xlink="xl"><svg><image xlink:href="x"/>'''
        root = parse(src, maybe_xhtml=True)
        self.ae(
            tostring(root),
            '''<html xmlns="http://www.w3.org/1999/xhtml" xmlns:xlink="xl"><head/><body>\
<svg xmlns="http://www.w3.org/2000/svg"><image xlink:href="x"/></svg></body></html>'''
        )
Example #31
def pagure_html_diff_changed(html_string):
    parsed = html5_parser.parse(str(html_string))
    elements = parsed.xpath("//section[contains(@class, 'commit_diff')]"
                            "//div[contains(@class, 'card-header')]"
                            "//a[contains(@class, 'font-weight-bold')]"
                            "/text()")

    return {str(x) for x in elements}
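
The XPath targets Pagure's commit-diff markup; a hand-written fragment of the same shape exercises it end to end:

snippet = '''<section class="commit_diff">
  <div class="card-header"><a class="font-weight-bold">file.py</a></div>
</section>'''
print(pagure_html_diff_changed(snippet))  # {'file.py'}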
Example #32
def parse_html(raw):
    try:
        from html5_parser import parse
    except ImportError:
        # Old versions of calibre
        import html5lib
        return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    else:
        return parse(raw)
Example #33
def _parse_page_html5(content: str) -> etree._Element:
    """
    Internal function to parse a page and return the data.

    This uses html5_parser.
    """
    if content and content.lower() != 'none':
        data = parse(content)
        return data
Example #35
 def test_line_numbers(self):
     root = parse('<html>\n<head>\n<body>\n<p><span>', line_number_attr='ln')
     self.ae(root.sourceline, 1)
     self.ae(int(root.get('ln')), 1)
     self.ae(root[0].sourceline, 2)
     self.ae(root[1].sourceline, 3)
     self.ae(root[1][0].sourceline, 4)
     self.ae(root[1][0][0].sourceline, 4)
     self.ae(root[1][0][0].get('ln'), '4')
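
Outside the test, line_number_attr works the same on any attribute name you pick:

from html5_parser import parse

root = parse('<html>\n<body>\n<p>hi', line_number_attr='data-lnum')
p = root[1][0]  # body's first child
print(p.get('data-lnum'), p.sourceline)  # 3 3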
Example #36
def links_diff(a_text, b_text, a_headers=None, b_headers=None,
               content_type_options='normal'):
    """
    Extracts all the outgoing links from a page and produces a diff of an
    HTML document that is simply a list of the text and URL of those links.

    It ignores links that merely navigate within the page.

    NOTE: this diff currently suffers from the fact that our diff server does
    not know the original URL of the content, so it can identify:

    >>> <a href="#anchor-in-this-page">Text</a>

    as an internal link, but not:

    >>> <a href="http://this.domain.com/this/page#anchor-in-this-page">Text</a>
    """
    raise_if_not_diffable_html(
        a_text,
        b_text,
        a_headers,
        b_headers,
        content_type_options)

    a_soup = html5_parser.parse(a_text, treebuilder='soup', return_root=False)
    b_soup = html5_parser.parse(b_text, treebuilder='soup', return_root=False)

    a_links = sorted(
        set([Link.from_element(element) for element in _find_outgoing_links(a_soup)]),
        key=lambda link: link.text.lower() + f'({link.href})')
    b_links = sorted(
        set([Link.from_element(element) for element in _find_outgoing_links(b_soup)]),
        key=lambda link: link.text.lower() + f'({link.href})')

    matcher = SequenceMatcher(a=a_links, b=b_links)
    opcodes = matcher.get_opcodes()
    diff = list(_assemble_diff(a_links, b_links, opcodes))

    return {
        'change_count': _count_changes(diff),
        'diff': diff,
        'a_parsed': a_soup,
        'b_parsed': b_soup
    }
Example #37
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = clean_xml_chars(raw)
    root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False, sanitize_names=True)
    if (discard_namespaces and root.tag != 'html') or (
        not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)):
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
Example #38
def html5_parse(data, max_nesting_depth=100):
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    data = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
    # Check that the asinine HTML 5 algorithm did not result in a tree with
    # insane nesting depths
    for x in data.iterdescendants():
        if isinstance(x.tag, basestring) and len(x) == 0:  # Leaf node
            depth = node_depth(x)
            if depth > max_nesting_depth:
                raise ValueError('HTML 5 parsing resulted in a tree with nesting'
                        ' depth > %d'%max_nesting_depth)
    return data
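
node_depth is a calibre helper not included in this excerpt; a plausible stand-in simply counts ancestors up to the root:

def node_depth(node):
    # Number of ancestors between this lxml node and the tree root.
    depth = 0
    parent = node.getparent()
    while parent is not None:
        depth += 1
        parent = parent.getparent()
    return depth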
Example #39
    def details(self, app_id):
        """Sends a GET request and parses an application's details.

        :param app_id: the app to retrieve details from, e.g. 'com.nintendo.zaaa'
        :return: a dictionary of app details
        """
        url = build_url('details', app_id)

        try:
            response = send_request('GET', url)
            soup = html5_parser.parse(response.text, treebuilder='soup')
        except requests.exceptions.HTTPError as e:
            raise ValueError('Invalid application ID: {app}. {error}'.format(app=app_id, error=e))

        return self._parse_app_details(soup)
Example #40
def parse_html_toc(data):
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    from lxml import etree
    if isinstance(data, bytes):
        data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0]
    root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
    for a in root.xpath('//*[@href and local-name()="a"]'):
        purl = urlparse(unquote(a.get('href')))
        href, fragment = purl[2], purl[5]
        if not fragment:
            fragment = None
        else:
            fragment = fragment.strip()
        href = href.strip()

        txt = etree.tostring(a, method='text', encoding='unicode')
        yield href, fragment, txt
Example #41
 def fix_resources(self, html, base_dir):
     from html5_parser import parse
     root = parse(html)
     changed = False
     for img in root.xpath('//img[@src]'):
         src = img.get('src')
         prefix = src.split(':', 1)[0].lower()
         if prefix not in ('file', 'http', 'https', 'ftp') and not os.path.isabs(src):
             src = os.path.join(base_dir, src)
             if os.access(src, os.R_OK):
                 with open(src, 'rb') as f:
                     data = f.read()
                 f = self.shift_file(base_dir, os.path.basename(src), data)
                 changed = True
                 img.set('src', os.path.basename(f))
     if changed:
         from lxml import etree
         html = etree.tostring(root, encoding='unicode')
     return html
Example #42
    def similar(self, app_id, results=None, detailed=False):
        """Sends a POST request and retrieves a list of applications similar to
        the specified app.

        :param app_id: the app to retrieve details from, e.g. 'com.nintendo.zaaa'
        :param results: the number of apps to retrieve at a time.
        :param detailed: if True, sends request per app for its full detail
        :return: a list of similar apps
        """
        results = s.SIMILAR_RESULTS if results is None else results

        url = build_url('similar', app_id)
        data = generate_post_data(results)
        response = send_request('POST', url, data)
        soup = html5_parser.parse(response.text, treebuilder='soup')

        if detailed:
            apps = self._parse_multiple_apps(response)
        else:
            apps = [self._parse_card_info(app) for app in soup.select('div[data-uitype=500]')]

        return apps
Example #43
    def html(self):
        ans = u''
        try:
            if not self.page().mainFrame().documentElement().findFirst('meta[name="calibre-dont-sanitize"]').isNull():
                # Bypass cleanup if special meta tag exists
                return unicode_type(self.page().mainFrame().toHtml())
            check = unicode_type(self.page().mainFrame().toPlainText()).strip()
            raw = unicode_type(self.page().mainFrame().toHtml())
            raw = xml_to_unicode(raw, strip_encoding_pats=True,
                                resolve_entities=True)[0]
            raw = self.comments_pat.sub('', raw)
            if not check and '<img' not in raw.lower():
                return ans

            try:
                root = html.fromstring(raw)
            except Exception:
                root = parse(raw, maybe_xhtml=False, sanitize_names=True)

            elems = []
            for body in root.xpath('//body'):
                if body.text:
                    elems.append(body.text)
                elems += [html.tostring(x, encoding='unicode') for x in body if
                    x.tag not in ('script', 'style')]

            if len(elems) > 1:
                ans = u'<div>%s</div>'%(u''.join(elems))
            else:
                ans = u''.join(elems)
                if not ans.startswith('<'):
                    ans = '<p>%s</p>'%ans
            ans = xml_replace_entities(ans)
        except:
            import traceback
            traceback.print_exc()

        return ans
Example #44
    def collection(self, collection, category=None, results=None, page=None, age=None, detailed=False):
        """Sends a POST request and fetches a list of applications belonging to
        the collection and an optional category.

        :param collection: the collection id, e.g. 'NEW_FREE'.
        :param category: (optional) the category name, e.g. 'GAME_ACTION'.
        :param results: the number of apps to retrieve at a time.
        :param page: page number to retrieve; limitation: page * results <= 500.
        :param age: an age range to filter by (only for FAMILY categories)
        :param detailed: if True, sends request per app for its full detail
        :return: a list of app dictionaries
        """
        collection = self.collections[collection]
        category = '' if category is None else self.categories[category]

        results = s.NUM_RESULTS if results is None else results
        if results > 120:
            raise ValueError('Number of results cannot be more than 120.')

        page = 0 if page is None else page
        if page * results > 500:
            raise ValueError('Start (page * results) cannot be greater than 500.')

        params = {}
        if category.startswith('FAMILY') and age is not None:
            params['age'] = self.age[age]

        url = build_collection_url(category, collection)
        data = generate_post_data(results, page)
        response = send_request('POST', url, data, params)

        if detailed:
            apps = self._parse_multiple_apps(response)
        else:
            soup = html5_parser.parse(response.text, treebuilder='soup')
            apps = [self._parse_card_info(app) for app in soup.select('div[data-uitype=500]')]

        return apps
Example #45
from html5_parser import parse
from lxml.etree import tostring
root = parse(some_html)
print(tostring(root))
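
A small variation on the snippet above showing unicode serialization (some_html stands in for any HTML string):

from html5_parser import parse
from lxml.etree import tostring

root = parse('<p>Hello, world')
print(tostring(root, encoding='unicode', pretty_print=True))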
Example #46
    def extract_content(self, output_dir, parse_cache):
        output_dir = os.path.abspath(output_dir)
        self.check_for_drm()
        processed_records = self.extract_text()
        if self.debug is not None:
            parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
        self.add_anchors()
        self.processed_html = self.processed_html.decode(self.book_header.codec,
            'ignore')
        self.processed_html = self.processed_html.replace('</</', '</')
        self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
                self.processed_html)
        self.processed_html = self.processed_html.replace(u'\ufeff', '')
        # Remove tags of the form <xyz: ...> as they can cause issues further
        # along the pipeline
        self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
                self.processed_html)

        self.processed_html = strip_encoding_declarations(self.processed_html)
        self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
            self.processed_html)
        self.extract_images(processed_records, output_dir)
        self.replace_page_breaks()
        self.cleanup_html()

        self.log.debug('Parsing HTML...')
        self.processed_html = clean_xml_chars(self.processed_html)
        try:
            root = html.fromstring(self.processed_html)
            if len(root.xpath('//html')) > 5:
                root = html.fromstring(self.processed_html.replace('\x0c',
                    '').replace('\x14', ''))
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = html.fromstring(self.processed_html)
        if root.xpath('descendant::p/descendant::p'):
            from html5_parser import parse
            self.log.warning('Malformed markup, parsing using html5-parser')
            self.processed_html = strip_encoding_declarations(self.processed_html)
            try:
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
            except Exception:
                self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
                self.processed_html = self.remove_random_bytes(self.processed_html)
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
            if len(root.xpath('body/descendant::*')) < 1:
                # There are probably stray </html>s in the markup
                self.processed_html = self.processed_html.replace('</html>',
                        '')
                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)

        if root.tag != 'html':
            self.log.warn('File does not have opening <html> tag')
            nroot = html.fromstring('<html><head></head><body></body></html>')
            bod = nroot.find('body')
            for child in list(root):
                child.getparent().remove(child)
                bod.append(child)
            root = nroot

        htmls = list(root.xpath('//html'))

        if len(htmls) > 1:
            self.log.warn('Markup contains multiple <html> tags, merging.')
            # Merge all <head> and <body> sections
            for h in htmls:
                p = h.getparent()
                if hasattr(p, 'remove'):
                    p.remove(h)
            bodies, heads = root.xpath('//body'), root.xpath('//head')
            for x in root:
                root.remove(x)
            head, body = map(root.makeelement, ('head', 'body'))
            for h in heads:
                for x in h:
                    h.remove(x)
                    head.append(x)
            for b in bodies:
                for x in b:
                    b.remove(x)
                    body.append(x)
            root.append(head), root.append(body)
        for x in root.xpath('//script'):
            x.getparent().remove(x)

        head = root.xpath('//head')
        if head:
            head = head[0]
        else:
            head = root.makeelement('head', {})
            root.insert(0, head)
        head.text = '\n\t'
        link = head.makeelement('link', {'type':'text/css',
            'href':'styles.css', 'rel':'stylesheet'})
        head.insert(0, link)
        link.tail = '\n\t'
        title = head.xpath('descendant::title')
        m = head.makeelement('meta', {'http-equiv':'Content-Type',
            'content':'text/html; charset=utf-8'})
        head.insert(0, m)
        if not title:
            title = head.makeelement('title', {})
            try:
                title.text = self.book_header.title
            except ValueError:
                title.text = clean_ascii_chars(self.book_header.title)
            title.tail = '\n\t'
            head.insert(0, title)
            head.text = '\n\t'

        self.upshift_markup(root)
        guides = root.xpath('//guide')
        guide = guides[0] if guides else None
        metadata_elems = root.xpath('//metadata')
        if metadata_elems and self.book_header.exth is None:
            self.read_embedded_metadata(root, metadata_elems[0], guide)
        for elem in guides + metadata_elems:
            elem.getparent().remove(elem)
        htmlfile = os.path.join(output_dir, 'index.html')
        try:
            for ref in guide.xpath('descendant::reference'):
                if 'href' in ref.attrib:
                    ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
        except AttributeError:
            pass

        def write_as_utf8(path, data):
            if isinstance(data, unicode_type):
                data = data.encode('utf-8')
            with lopen(path, 'wb') as f:
                f.write(data)

        parse_cache[htmlfile] = root
        self.htmlfile = htmlfile
        ncx = io.BytesIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
        opf.render(lopen(self.created_opf_path, 'wb'), ncx,
            ncx_manifest_entry=ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
            write_as_utf8(ncx_path, ncx)

        css = [self.base_css_rules, '\n\n']
        for cls, rule in self.tag_css_rules.items():
            css.append('.%s { %s }\n\n' % (cls, rule))
        write_as_utf8('styles.css', ''.join(css))

        if self.book_header.exth is not None or self.embedded_mi is not None:
            self.log.debug('Creating OPF...')
            ncx = io.BytesIO()
            opf, ncx_manifest_entry  = self.create_opf(htmlfile, guide, root)
            opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
                ncx_manifest_entry)
            ncx = ncx.getvalue()
            if ncx:
                write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
Example #47
def parse_html(raw):
    raw = clean_html(raw)
    from html5_parser import parse
    return parse(raw)