Example #1
    def get_soup(self, src, url=None):
        nmassage = []
        nmassage.extend(self.preprocess_regexps)
        # Remove comments, as they can leave detritus behind when tags are
        # extracted; tag extraction can also leave multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        for pat, repl in nmassage:
            usrc = pat.sub(repl, usrc)
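        # Build the soup with html5-parser's soup API, using the active
        # BeautifulSoup module for the node classes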
        set_soup_module(sys.modules[BeautifulSoup.__module__])
        soup = parse(usrc, return_root=False)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
            for pat, repl in nmassage:
                replace = pat.sub(repl, replace)
            soup = parse(replace, return_root=False)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
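            # Walk up from tag to <body>, at each level extracting every
            # sibling in the given direction ('nextSibling'/'previousSibling')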
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Example #2
    def get_soup(self, src, url=None):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        # Some websites have buggy doctype declarations that mess up beautifulsoup
        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
        # Remove comments, as they can leave detritus behind when tags are
        # extracted; tag extraction can also leave multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        soup = BeautifulSoup(usrc, markupMassage=nmassage)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
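            # Walk up from tag to <body>, at each level extracting every
            # sibling in the given direction ('nextSibling'/'previousSibling')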
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Example #3
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = handle_private_entities(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag, as it can cause problems,
    # especially doctypes; preserve the original line numbers by inserting
    # newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break

    raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
    try:
        ans = safe_xml_fromstring(raw, recover=False)
        if ans.tag != '{%s}html' % XHTML_NS:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            for elem in ans.iter(LxmlElement):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
Example #4
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag, as it can cause problems,
    # especially doctypes; preserve the original line numbers by inserting
    # newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break

    raw = strip_encoding_declarations(raw)
    try:
        parser = XMLParser(no_network=True)
        ans = fromstring(raw, parser=parser)
        if linenumber_attribute:
            for elem in ans.iter(LxmlElement):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
Example #5
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = fix_self_closing_cdata_tags(raw)  # TODO: Handle this in the parser
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = replace_chars.sub('', raw)

    stream_class = partial(FastStream, track_position=line_numbers)
    stream = stream_class(raw)
    builder = partial(NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder, linenumber_attribute=linenumber_attribute)
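    # Reparse in a loop: if the tree builder reports namespaced tags in the
    # markup, strip the offending prefix from the raw HTML and start over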
    while True:
        try:
            parser = HTMLParser(tree=builder, track_positions=line_numbers, namespaceHTMLElements=not discard_namespaces)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', category=DataLossWarning)
                try:
                    parser.parse(stream, parseMeta=False, useChardet=False)
                finally:
                    parser.tree.proxy_cache = None
        except NamespacedHTMLPresent as err:
            raw = re.sub(r'<\s*/{0,1}(%s:)' % err.prefix, lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I)
            stream = stream_class(raw)
            continue
        break
    root = parser.tree.getDocument()
    if (discard_namespaces and root.tag != 'html') or (
        not discard_namespaces and (root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix)):
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
Example #6
def get_metadata_from_reader(rdr):
    raw = rdr.GetFile(rdr.home)
    home = BeautifulSoup(
        xml_to_unicode(raw, strip_encoding_pats=True,
                       resolve_entities=True)[0])

    title = rdr.title
    try:
        x = rdr.GetEncoding()
        codecs.lookup(x)
        enc = x
    except:
        enc = 'cp1252'
    title = force_unicode(title, enc)
    authors = _get_authors(home)
    mi = MetaInformation(title, authors)
    publisher = _get_publisher(home)
    if publisher:
        mi.publisher = publisher
    isbn = _get_isbn(home)
    if isbn:
        mi.isbn = isbn
    comments = _get_comments(home)
    if comments:
        mi.comments = comments

    cdata = _get_cover(home, rdr)
    if cdata is not None:
        mi.cover_data = ('jpg', cdata)

    return mi
Example #7
    def search(self, query, max_results=10, timeout=60):
        search_url = u'http://robot.litres.ru/pages/catalit_browser/?checkpoint=2000-01-02&'\
        'search=%s&limit=0,%s'
        search_url = search_url % (urllib2.quote(query), max_results)

        counter = max_results
        br = browser()
        br.addheaders.append(['Accept-Encoding', 'gzip'])

        with closing(br.open(search_url, timeout=timeout)) as r:
            ungzipResponse(r, br)
            raw = xml_to_unicode(r.read(),
                                 strip_encoding_pats=True,
                                 assume_utf8=True)[0]

            parser = etree.XMLParser(recover=True, no_network=True)
            doc = etree.fromstring(raw, parser=parser)
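            # each fb2-book element in the feed describes one search hit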
            for data in doc.xpath('//*[local-name() = "fb2-book"]'):
                if counter <= 0:
                    break
                counter -= 1

                try:
                    sRes = self.create_search_result(data)
                except Exception as e:
                    prints('ERROR: cannot parse search result #%s: %s' %
                           (max_results - counter + 1, e))
                    continue
                yield sRes
Example #8
    def search(self, query, max_results=15, timeout=60):
        search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
                    'searchText=%s&searchContext=ebook' % urllib2.quote(query)
        search_urls = [search_url]

        # try this first if the query looks like an ozon ID
        if re.match(r'^\d{6,9}$', query):
            ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query
            search_urls.insert(0, ozon_detail)

        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
        counter = max_results
        br = browser()
        
        for url in search_urls:
            with closing(br.open(url, timeout=timeout)) as f:
                raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
                doc = etree.fromstring(raw)
                for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
                    if counter <= 0:
                        break
                    counter -= 1

                    s = SearchResult()
                    s.detail_item = data.xpath(xp_template.format('ID'))
                    s.title = data.xpath(xp_template.format('Name'))
                    s.author = data.xpath(xp_template.format('Author'))
                    s.price = data.xpath(xp_template.format('Price'))
                    s.cover_url = data.xpath(xp_template.format('Picture'))
                    s.price = format_price_in_RUR(s.price)
                    yield s
Example #9
def get_decoded_raw(name):
    from calibre.ebooks.chardet import xml_to_unicode, force_encoding
    with open(name, 'rb') as f:
        raw = f.read()
    syntax = syntax_from_mime(name, guess_type(name))
    if syntax is None:
        try:
            raw = raw.decode('utf-8')
        except ValueError:
            pass
    elif syntax != 'raster_image':
        if syntax in {'html', 'xml'}:
            raw = xml_to_unicode(raw, verbose=True)[0]
        else:
            m = re.search(br"coding[:=]\s*([-\w.]+)", raw[:1024], flags=re.I)
            if m is not None and m.group(1) != b'8bit':
                # the regex is a bytes pattern, so decode the captured name
                enc = m.group(1).decode('ascii', 'replace')
                if enc == 'unicode':
                    enc = 'utf-8'
            else:
                enc = force_encoding(raw, verbose=True)
            try:
                raw = raw.decode(enc)
            except (LookupError, ValueError):
                try:
                    raw = raw.decode('utf-8')
                except ValueError:
                    pass
    return raw, syntax
Example #10
    def html(self):
        raw = original_html = self.toHtml()
        check = self.toPlainText().strip()
        raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
        raw = self.comments_pat.sub('', raw)
        if not check and '<img' not in raw.lower():
            return ''

        root = parse(raw, maybe_xhtml=False, sanitize_names=True)
        if root.xpath('//meta[@name="calibre-dont-sanitize"]'):
            # Bypass cleanup if special meta tag exists
            return original_html

        try:
            cleanup_qt_markup(root)
        except Exception:
            import traceback
            traceback.print_exc()
        elems = []
        for body in root.xpath('//body'):
            if body.text:
                elems.append(body.text)
            elems += [html.tostring(x, encoding='unicode') for x in body if
                x.tag not in ('script', 'style')]

        if len(elems) > 1:
            ans = '<div>%s</div>'%(u''.join(elems))
        else:
            ans = ''.join(elems)
            if not ans.startswith('<'):
                ans = '<p>%s</p>'%ans
        return xml_replace_entities(ans)
Example #11
    def decode(self, data):
        """Automatically decode :param:`data` into a `unicode` object."""
        def fix_data(d):
            return d.replace('\r\n', '\n').replace('\r', '\n')
        if isinstance(data, unicode):
            return fix_data(data)
        bom_enc = None
        if data[:4] in {b'\0\0\xfe\xff', b'\xff\xfe\0\0'}:
            bom_enc = {b'\0\0\xfe\xff': 'utf-32-be',
                       b'\xff\xfe\0\0': 'utf-32-le'}[data[:4]]
            data = data[4:]
        elif data[:2] in {b'\xff\xfe', b'\xfe\xff'}:
            bom_enc = {b'\xff\xfe': 'utf-16-le', b'\xfe\xff': 'utf-16-be'}[data[:2]]
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            bom_enc = 'utf-8'
            data = data[3:]
        if bom_enc is not None:
            try:
                self.used_encoding = bom_enc
                return fix_data(data.decode(bom_enc))
            except UnicodeDecodeError:
                pass
        try:
            self.used_encoding = 'utf-8'
            return fix_data(data.decode('utf-8'))
        except UnicodeDecodeError:
            pass
        data, self.used_encoding = xml_to_unicode(data)
        return fix_data(data)
Example #12
def search(query, max_results=15, timeout=60):
    url = 'http://www.ozon.ru/?context=search&text=%s&store=1,0&group=div_book' % quote_plus(
        query)

    counter = max_results
    br = browser()

    with closing(br.open(url, timeout=timeout)) as f:
        raw = xml_to_unicode(f.read(),
                             strip_encoding_pats=True,
                             assume_utf8=True)[0]
        root = parse_html(raw)
        for tile in root.xpath('//*[@class="bShelfTile inline"]'):
            if counter <= 0:
                break
            counter -= 1

            s = SearchResult(store_name='OZON.ru')
            s.detail_item = shop_url + tile.xpath(
                'descendant::a[@class="eShelfTile_Link"]/@href')[0]
            s.title = tile.xpath(
                'descendant::span[@class="eShelfTile_ItemNameText"]/@title')[0]
            s.author = tile.xpath(
                'descendant::span[@class="eShelfTile_ItemPerson"]/@title')[0]
            s.price = ''.join(
                tile.xpath(
                    'descendant::div[contains(@class, "eShelfTile_Price")]/text()'
                ))
            s.cover_url = 'http:' + tile.xpath(
                'descendant::img/@data-original')[0]
            s.price = format_price_in_RUR(s.price)
            yield s
Example #14
    def decode(self, data):
        """Automatically decode :param:`data` into a `unicode` object."""

        def fix_data(d):
            return d.replace("\r\n", "\n").replace("\r", "\n")

        if isinstance(data, unicode):
            return fix_data(data)
        bom_enc = None
        if data[:4] in {b"\0\0\xfe\xff", b"\xff\xfe\0\0"}:
            bom_enc = {b"\0\0\xfe\xff": "utf-32-be", b"\xff\xfe\0\0": "utf-32-le"}[data[:4]]
            data = data[4:]
        elif data[:2] in {b"\xff\xfe", b"\xfe\xff"}:
            bom_enc = {b"\xff\xfe": "utf-16-le", b"\xfe\xff": "utf-16-be"}[data[:2]]
            data = data[2:]
        elif data[:3] == b"\xef\xbb\xbf":
            bom_enc = "utf-8"
            data = data[3:]
        if bom_enc is not None:
            try:
                self.used_encoding = bom_enc
                return fix_data(data.decode(bom_enc))
            except UnicodeDecodeError:
                pass
        try:
            self.used_encoding = "utf-8"
            return fix_data(data.decode("utf-8"))
        except UnicodeDecodeError:
            pass
        data, self.used_encoding = xml_to_unicode(data)
        return fix_data(data)
Example #15
    def search(self, query, max_results=15, timeout=60):
        search_url = (
            self.shop_url + "/webservice/webservice.asmx/SearchWebService?"
            "searchText=%s&searchContext=ebook" % urllib2.quote(query)
        )
        search_urls = [search_url]

        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
        counter = max_results
        br = browser()

        for url in search_urls:
            with closing(br.open(url, timeout=timeout)) as f:
                raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
                doc = etree.fromstring(raw)
                for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
                    if counter <= 0:
                        break
                    counter -= 1

                    s = SearchResult()
                    s.detail_item = data.xpath(xp_template.format("ID"))
                    s.title = data.xpath(xp_template.format("Name"))
                    s.author = data.xpath(xp_template.format("Author"))
                    s.price = data.xpath(xp_template.format("Price"))
                    s.cover_url = data.xpath(xp_template.format("Picture"))
                    s.price = format_price_in_RUR(s.price)
                    yield s
Example #16
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag, as it can cause problems,
    # especially doctypes; preserve the original line numbers by inserting
    # newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break

    raw = strip_encoding_declarations(raw)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
    try:
        parser = XMLParser(no_network=True)
        ans = fromstring(raw, parser=parser)
        if ans.tag != '{%s}html' % html_ns:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            for elem in ans.iter(LxmlElement):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
Example #18
    def make_query(self,
                   q,
                   abort,
                   title=None,
                   authors=None,
                   identifiers={},
                   max_pages=10,
                   timeout=30):
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.utils.cleantext import clean_ascii_chars

        page_num = 1
        parser = etree.XMLParser(recover=True, no_network=True)
        br = self.browser

        seen = set()

        candidates = []
        total_found = 0
        while page_num <= max_pages and not abort.is_set():
            url = q.replace('&page_number=1&', '&page_number=%d&' % page_num)
            page_num += 1
            raw = br.open_novisit(url, timeout=timeout).read()
            feed = etree.fromstring(xml_to_unicode(
                clean_ascii_chars(raw), strip_encoding_pats=True)[0],
                                    parser=parser)
            total, found, results = self.parse_feed(feed, seen, title, authors,
                                                    identifiers)
            total_found += found
            candidates += results
            if total_found >= total or len(candidates) > 9:
                break

        return candidates
Example #19
    def search(self, query, max_results=15, timeout=60):
        search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
                    'searchText=%s&searchContext=ebook' % urllib2.quote(query)
        search_urls = [search_url]

        # try this first if the query looks like an ozon ID
        if re.match(r'^\d{6,9}$', query):
            ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query
            search_urls.insert(0, ozon_detail)

        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
        counter = max_results
        br = browser()

        for url in search_urls:
            with closing(br.open(url, timeout=timeout)) as f:
                raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
                doc = etree.fromstring(raw)
                for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
                    if counter <= 0:
                        break
                    counter -= 1

                    s = SearchResult()
                    s.detail_item = data.xpath(xp_template.format('ID'))
                    s.title = data.xpath(xp_template.format('Name'))
                    s.author = data.xpath(xp_template.format('Author'))
                    s.price = data.xpath(xp_template.format('Price'))
                    s.cover_url = data.xpath(xp_template.format('Picture'))
                    s.price = format_price_in_RUR(s.price)
                    yield s
Example #20
def parse_html(raw):
    import html5lib
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars

    raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)[0])
    return html5lib.parse(raw, treebuilder="lxml", namespaceHTMLElements=False).getroot()
Example #21
    def make_query(self, q, abort, title=None, authors=None, identifiers={},
            max_pages=10, timeout=30):
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.utils.cleantext import clean_ascii_chars

        page_num = 1
        parser = etree.XMLParser(recover=True, no_network=True)
        br = self.browser

        seen = set()

        candidates = []
        total_found = 0
        while page_num <= max_pages and not abort.is_set():
            url = q.replace('&page_number=1&', '&page_number=%d&'%page_num)
            page_num += 1
            raw = br.open_novisit(url, timeout=timeout).read()
            feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
                strip_encoding_pats=True)[0], parser=parser)
            total, found, results = self.parse_feed(
                    feed, seen, title, authors, identifiers)
            total_found += found
            candidates += results
            if total_found >= total or len(candidates) > 9:
                break

        return candidates
Example #22
    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=60):  # {{{
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode

        if not self.is_configured():
            return
        query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
        if not query:
            err = u'Insufficient metadata to construct query'
            log.error(err)
            return err

        try:
            raw = self.browser.open_novisit(query).read()

        except Exception as e:
            log.exception(u'Failed to make identify query: %r'%query)
            return as_unicode(e)

        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
            entries = feed.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]')
            if entries:
                metadata = self.get_metadata(log, entries, title, authors, identifiers)
                self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)
Example #23
    def search(self, query, max_results=10, timeout=60):
        search_url = u'http://robot.litres.ru/pages/catalit_browser/?checkpoint=2000-01-02&'\
        'search=%s&limit=0,%s'
        search_url = search_url % (urllib2.quote(query), max_results)

        counter = max_results
        br = browser()
        br.addheaders.append(['Accept-Encoding', 'gzip'])

        with closing(br.open(search_url, timeout=timeout)) as r:
            ungzipResponse(r, br)
            raw = xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]

            parser = etree.XMLParser(recover=True, no_network=True)
            doc = etree.fromstring(raw, parser=parser)
            for data in doc.xpath('//*[local-name() = "fb2-book"]'):
                if counter <= 0:
                    break
                counter -= 1

                try:
                    sRes = self.create_search_result(data)
                except Exception as e:
                    prints('ERROR: cannot parse search result #%s: %s' % (max_results - counter + 1, e))
                    continue
                yield sRes
Example #24
def get_metadata_from_reader(rdr):
    raw = rdr.GetFile(rdr.home)
    home = BeautifulSoup(xml_to_unicode(raw, strip_encoding_pats=True,
        resolve_entities=True)[0])

    title = rdr.title
    try:
        x = rdr.GetEncoding()
        codecs.lookup(x)
        enc = x
    except:
        enc = 'cp1252'
    title = force_unicode(title, enc)
    authors = _get_authors(home)
    mi = MetaInformation(title, authors)
    publisher = _get_publisher(home)
    if publisher:
        mi.publisher = publisher
    isbn = _get_isbn(home)
    if isbn:
        mi.isbn = isbn
    comments = _get_comments(home)
    if comments:
        mi.comments = comments

    cdata = _get_cover(home, rdr)
    if cdata is not None:
        mi.cover_data = ('jpg', cdata)

    return mi
Example #25
    def _parse(self, raw, mimetype):
        mt = mimetype.lower()
        if mt.endswith('+xml'):
            parser = etree.XMLParser(no_network=True, huge_tree=not iswindows)
            raw = xml_to_unicode(raw,
                strip_encoding_pats=True, assume_utf8=True,
                resolve_entities=True)[0].strip()
            idx = raw.find('<html')
            if idx == -1:
                idx = raw.find('<HTML')
            if idx > -1:
                pre = raw[:idx]
                raw = raw[idx:]
                if '<!DOCTYPE' in pre:
                    # the DOCTYPE in the discarded preamble may define entities;
                    # expand them by hand since the parser never sees the definitions
                    user_entities = {}
                    for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                        val = match.group(2)
                        if val.startswith('"') and val.endswith('"'):
                            val = val[1:-1]
                        user_entities[match.group(1)] = val
                    if user_entities:
                        pat = re.compile(r'&(%s);' % ('|'.join(user_entities.keys())))
                        raw = pat.sub(lambda m: user_entities[m.group(1)], raw)
            return etree.fromstring(raw, parser=parser)
        return raw
Example #26
        def fget(self):
            ans = u''
            try:
                check = unicode(self.page().mainFrame().toPlainText()).strip()
                raw = unicode(self.page().mainFrame().toHtml())
                raw = xml_to_unicode(raw, strip_encoding_pats=True,
                                    resolve_entities=True)[0]
                raw = self.comments_pat.sub('', raw)
                if not check and '<img' not in raw.lower():
                    return ans

                try:
                    root = html.fromstring(raw)
                except:
                    root = fromstring(raw)

                elems = []
                for body in root.xpath('//body'):
                    if body.text:
                        elems.append(body.text)
                    elems += [html.tostring(x, encoding=unicode) for x in body if
                        x.tag not in ('script', 'style')]

                if len(elems) > 1:
                    ans = u'<div>%s</div>'%(u''.join(elems))
                else:
                    ans = u''.join(elems)
                    if not ans.startswith('<'):
                        ans = '<p>%s</p>'%ans
                ans = xml_replace_entities(ans)
            except:
                import traceback
                traceback.print_exc()

            return ans
Example #27
def parse_outline(raw, output_dir):
    from lxml import etree
    from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
    raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
    outline = etree.fromstring(raw, parser=RECOVER_PARSER).xpath('(//outline)[1]')
    if outline:
        from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
        outline = outline[0]
        toc = TOC()
        count = [0]
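        # a one-element list so that process_node below can mutate the count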

        def process_node(node, toc):
            for child in node.iterdescendants('*'):
                if child.tag == 'outline':
                    parent = toc.children[-1] if toc.children else toc
                    process_node(child, parent)
                else:
                    page = child.get('page', '1')
                    toc.add(child.text, 'index.html', page)
                    count[0] += 1
        process_node(outline, toc)
        if count[0] > 2:
            root = create_ncx(toc, (lambda x:x), 'pdftohtml', 'en', 'pdftohtml')
            with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
                f.write(etree.tostring(root, pretty_print=True, with_tail=False, encoding='utf-8', xml_declaration=True))
Example #28
    def postprocess_book(self, oeb, opts, log):
        from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
        for item in oeb.spine:
            root = item.data
            if not hasattr(root, 'xpath'):
                continue
            for bad in ('metadata', 'guide'):
                metadata = XPath('//h:' + bad)(root)
                if metadata:
                    for x in metadata:
                        x.getparent().remove(x)
            body = XPath('//h:body')(root)
            if body:
                body = body[0]
                if len(body) == 1 and body[0].tag == XHTML('pre'):
                    pre = body[0]
                    from calibre.ebooks.txt.processor import convert_basic, \
                        separate_paragraphs_single_line
                    from calibre.ebooks.chardet import xml_to_unicode
                    from lxml import etree
                    import copy
                    html = separate_paragraphs_single_line(pre.text)
                    html = convert_basic(html).replace('<html>',
                            '<html xmlns="%s">' % XHTML_NS)
                    html = xml_to_unicode(html, strip_encoding_pats=True,
                            resolve_entities=True)[0]
                    root = etree.fromstring(html)
                    body = XPath('//h:body')(root)
                    pre.tag = XHTML('div')
                    pre.text = ''
                    for elem in body:
                        ne = copy.deepcopy(elem)
                        pre.append(ne)
Example #29
def get_series(title, authors, timeout=60):
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode):
        title = title.encode('utf-8')

    title = urllib.quote_plus(title)

    author = authors[0].strip()
    if not author:
        return mi
    if ',' in author:
        author = author.split(',')[0]
    else:
        author = author.split()[-1]

    url = URL.format(author, title)
    br = browser()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi
    raw = xml_to_unicode(raw)[0]
    soup = BeautifulSoup(raw)
    searcharea = soup.find('div', attrs={'class': 'searcharea'})
    if searcharea is None:
        return mi
    ss = searcharea.find('div', attrs={'class': 'seriessearch'})
    if ss is None:
        return mi
    a = ss.find('a', href=True)
    if a is None:
        return mi
    href = a['href'].partition('?')[-1]
    data = urlparse.parse_qs(href)
    series = data.get('SeriesName', [])
    if not series:
        return mi
    series = series[0]
    series = re.sub(r' series$', '', series).strip()
    if series:
        mi.series = series
    ns = ss.nextSibling
    if ns.contents:
        raw = unicode(ns.contents[0])
        raw = raw.partition('.')[0].strip()
        try:
            mi.series_index = int(raw)
        except:
            pass
    return mi
Example #30
    def identify(  # {{{
            self,
            log,
            result_queue,
            abort,
            title=None,
            authors=None,
            identifiers={},
            timeout=30):
        from lxml import etree
        entry = XPath('//atom:entry')

        query = self.create_query(log,
                                  title=title,
                                  authors=authors,
                                  identifiers=identifiers)
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
        log('Making query:', query)
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception('Failed to make identify query: %r' % query)
            return as_unicode(e)

        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(xml_to_unicode(
                clean_ascii_chars(raw), strip_encoding_pats=True)[0],
                                    parser=parser)
            entries = entry(feed)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)

        if not entries and title and not abort.is_set():
            if identifiers:
                log('No results found, retrying without identifiers')
                return self.identify(log,
                                     result_queue,
                                     abort,
                                     title=title,
                                     authors=authors,
                                     timeout=timeout)
            ntitle = cleanup_title(title)
            if ntitle and ntitle != title:
                log('No results found, retrying without sub-title')
                return self.identify(log,
                                     result_queue,
                                     abort,
                                     title=ntitle,
                                     authors=authors,
                                     timeout=timeout)

        # There is no point running these queries in threads as google
        # throttles requests, returning 403 Forbidden errors
        self.get_all_details(br, log, entries, abort, result_queue, timeout)
Example #31
    def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
        from html5_parser import parse
        from lxml import html
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.library.comments import sanitize_comments_html

        try:
            raw = br.open_novisit(metadata_url).read()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                return False
            raise
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]

        try:
            root = parse(raw, maybe_xhtml=False, sanitize_names=True)
        except Exception:
            return False

        pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
        lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
        subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
        ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
        desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")

        if pub_date:
            from calibre.utils.date import parse_date
            try:
                mi.pubdate = parse_date(pub_date[0].strip())
            except:
                pass
        if lang:
            lang = lang[0].strip().lower()
            lang = {'english':'eng', 'french':'fra', 'german':'deu',
                    'spanish':'spa'}.get(lang, None)
            if lang:
                mi.language = lang

        if ebook_isbn:
            # print "ebook isbn is "+str(ebook_isbn[0])
            isbn = check_isbn(ebook_isbn[0].strip())
            if isbn:
                self.cache_isbn_to_identifier(isbn, ovrdrv_id)
                mi.isbn = isbn
        if subjects:
            mi.tags = [tag.strip() for tag in subjects[0].split(',')]

        if desc:
            desc = desc[0]
            desc = html.tostring(desc, method='html', encoding='unicode').strip()
            # remove all attributes from tags
            desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
            # Remove comments
            desc = re.sub(r'(?s)<!--.*?-->', '', desc)
            mi.comments = sanitize_comments_html(desc)

        return None
Example #32
def get_series(title, authors, timeout=60):
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode):
        title = title.encode('utf-8')

    title = urllib.quote_plus(title)

    author = authors[0].strip()
    if not author:
        return mi
    if ',' in author:
        author = author.split(',')[0]
    else:
        author = author.split()[-1]

    url = URL.format(author, title)
    br = browser()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi
    raw = xml_to_unicode(raw)[0]
    soup = BeautifulSoup(raw)
    searcharea = soup.find('div', attrs={'class':'searcharea'})
    if searcharea is None:
        return mi
    ss = searcharea.find('div', attrs={'class':'seriessearch'})
    if ss is None:
        return mi
    a = ss.find('a', href=True)
    if a is None:
        return mi
    href = a['href'].partition('?')[-1]
    data = urlparse.parse_qs(href)
    series = data.get('SeriesName', [])
    if not series:
        return mi
    series = series[0]
    series = re.sub(r' series$', '', series).strip()
    if series:
        mi.series = series
    ns = ss.nextSibling
    if ns.contents:
        raw = unicode(ns.contents[0])
        raw = raw.partition('.')[0].strip()
        try:
            mi.series_index = int(raw)
        except:
            pass
    return mi
Example #33
    def read_ncx_toc(self, toc, root=None):
        self.base_path = os.path.dirname(toc)
        if root is None:
            with open(toc, 'rb') as f:
                raw = xml_to_unicode(f.read(),
                                     assume_utf8=True,
                                     strip_encoding_pats=True)[0]
            root = etree.fromstring(raw,
                                    parser=etree.XMLParser(recover=True,
                                                           no_network=True))
        xpn = {'re': 'http://exslt.org/regular-expressions'}
        XPath = functools.partial(etree.XPath, namespaces=xpn)

        def get_attr(node, default=None, attr='playorder'):
            for name, val in node.attrib.items():
                if name and val and name.lower().endswith(attr):
                    return val
            return default

        nl_path = XPath('./*[re:match(local-name(), "navlabel$", "i")]')
        txt_path = XPath('./*[re:match(local-name(), "text$", "i")]')
        content_path = XPath('./*[re:match(local-name(), "content$", "i")]')
        np_path = XPath('./*[re:match(local-name(), "navpoint$", "i")]')

        def process_navpoint(np, dest):
            try:
                play_order = int(get_attr(np, 1))
            except:
                play_order = 1
            href = fragment = text = None
            nd = dest
            nl = nl_path(np)
            if nl:
                nl = nl[0]
                text = ''
                for txt in txt_path(nl):
                    text += etree.tostring(txt,
                                           method='text',
                                           encoding='unicode',
                                           with_tail=False)
                content = content_path(np)
                if content and text:
                    content = content[0]
                    # if get_attr(content, attr='src'):
                    purl = urlparse(content.get('src'))
                    href, fragment = unquote(purl[2]), unquote(purl[5])
                    nd = dest.add_item(href, fragment, text)
                    nd.play_order = play_order

            for c in np_path(np):
                process_navpoint(c, nd)

        nm = XPath('//*[re:match(local-name(), "navmap$", "i")]')(root)
        if not nm:
            raise ValueError('NCX files must have a <navmap> element.')
        nm = nm[0]

        for child in np_path(nm):
            process_navpoint(child, self)
Example #34
def clean_html(raw):
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars
    return clean_ascii_chars(
        xml_to_unicode(raw,
                       strip_encoding_pats=True,
                       resolve_entities=True,
                       assume_utf8=True)[0])
Example #35
def parse_html(raw):
    import html5lib
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars
    raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,
                                resolve_entities=True, assume_utf8=True)[0])
    return html5lib.parse(raw, treebuilder='lxml',
                              namespaceHTMLElements=False).getroot()
Example #36
    def fetch_raw(
            self,
            log,
            url,
            br,
            testing,  # {{{
            identifiers={},
            timeout=30):
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.chardet import xml_to_unicode
        from lxml.html import tostring
        import html5lib
        try:
            raw = br.open_novisit(
                url, timeout=timeout).read().decode('gb18030').strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                            e.getcode() == 404:
                log.error('Query malformed: %r' % url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = _('DangDang timed out. Try again later.')
                log.error(msg)
            else:
                msg = 'Failed to make identify query: %r' % url
                log.exception(msg)
            return as_unicode(msg)

        raw = clean_ascii_chars(
            xml_to_unicode(raw,
                           strip_encoding_pats=True,
                           resolve_entities=True)[0])

        if testing:
            import tempfile
            with tempfile.NamedTemporaryFile(prefix='dangdang_results_',
                                             suffix='.html',
                                             delete=False) as f:
                f.write(raw.encode('utf-8'))
            print('Downloaded html for results page saved in', f.name)

        matches = []
        root = None  # ensure root is defined even when nothing was found
        # the Chinese string is DangDang's "page not found" page title
        found = '<title>对不起,您要访问的页面暂时没有找到' not in raw

        if found:
            try:
                root = html5lib.parse(raw,
                                      treebuilder='lxml',
                                      namespaceHTMLElements=False)
            except:
                msg = 'Failed to parse DangDang page for query: %r' % url
                log.exception(msg)
                return msg

        return found, root
Example #37
    def __new__(cls,
                path,
                mime_type=None,
                read_anchor_map=True,
                run_char_count=True,
                from_epub=False,
                read_links=True):
        ppath = path.partition('#')[0]
        if not os.path.exists(path) and os.path.exists(ppath):
            path = ppath
        obj = super(SpineItem, cls).__new__(cls, path)
        with open(path, 'rb') as f:
            raw = f.read()
        if from_epub:
            # According to the spec, HTML in EPUB must be encoded in utf-8 or
            # utf-16. Furthermore, there exist epub files produced by the usual
            # incompetents that have utf-8 encoded HTML files that contain
            # incorrect encoding declarations. See
            # http://www.idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#Section1.4.1.2
            # http://www.idpf.org/epub/30/spec/epub30-publications.html#confreq-xml-enc
            # https://bugs.launchpad.net/bugs/1188843
            # So we first decode with utf-8 and only if that fails do we try
            # xml_to_unicode. This is the same algorithm as that used by the
            # conversion pipeline (modulo some BOM based detection). Sigh.
            try:
                raw, obj.encoding = raw.decode('utf-8'), 'utf-8'
            except UnicodeDecodeError:
                raw, obj.encoding = xml_to_unicode(raw)
        else:
            raw, obj.encoding = xml_to_unicode(raw)
        obj.character_count = character_count(raw) if run_char_count else 10000
        obj.anchor_map = anchor_map(raw) if read_anchor_map else {}
        obj.all_links = all_links(raw) if read_links else set()
        obj.verified_links = set()
        obj.start_page = -1
        obj.pages = -1
        obj.max_page = -1
        obj.index_entries = []
        if mime_type is None:
            mime_type = guess_type(obj)[0]
        obj.mime_type = mime_type
        obj.is_single_page = None
        return obj
Example #38
    def get_details(self):
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.chardet import xml_to_unicode
        import html5lib

        try:
            raw = self.browser.open_novisit(
                self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Amazon timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        oraw = raw
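        # oraw keeps the undecoded page; parse_details() below is handed the original bytes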
        raw = xml_to_unicode(raw,
                             strip_encoding_pats=True,
                             resolve_entities=True)[0]
        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        try:
            root = html5lib.parse(clean_ascii_chars(raw),
                                  treebuilder='lxml',
                                  namespaceHTMLElements=False)
        except:
            msg = 'Failed to parse amazon details page: %r' % self.url
            self.log.exception(msg)
            return
        if self.domain == 'jp':
            for a in root.xpath('//a[@href]'):
                if 'black-curtain-redirect.html' in a.get('href'):
                    self.url = 'http://amazon.co.jp' + a.get('href')
                    self.log('Black curtain redirect found, following')
                    return self.get_details()

        errmsg = root.xpath('//*[@id="errorMessage"]')
        if errmsg:
            msg = 'Failed to parse amazon details page: %r' % self.url
            msg += self.tostring(errmsg, method='text',
                                 encoding=unicode).strip()
            self.log.error(msg)
            return

        self.parse_details(oraw, root)
Example #39
    def convert_epub3_nav(self, nav_path, opf, log):
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.oeb.polish.parsing import parse
        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX
        from calibre.ebooks.oeb.polish.toc import first_child
        from tempfile import NamedTemporaryFile
        with lopen(nav_path, 'rb') as f:
            raw = f.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
        root = parse(raw, log=log)
        ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
        navmap = ncx[0]
        et = '{%s}type' % EPUB_NS
        bn = os.path.basename(nav_path)

        def add_from_li(li, parent):
            href = text = None
            for x in li.iterchildren(XHTML('a'), XHTML('span')):
                text = etree.tostring(x, method='text', encoding=unicode, with_tail=False).strip() or ' '.join(x.xpath('descendant-or-self::*/@title')).strip()
                href = x.get('href')
                if href:
                    if href.startswith('#'):
                        href = bn + href
                break
            np = parent.makeelement(NCX('navPoint'))
            parent.append(np)
            np.append(np.makeelement(NCX('navLabel')))
            np[0].append(np.makeelement(NCX('text')))
            np[0][0].text = text
            if href:
                np.append(np.makeelement(NCX('content'), attrib={'src':href}))
            return np

        def process_nav_node(node, toc_parent):
            for li in node.iterchildren(XHTML('li')):
                child = add_from_li(li, toc_parent)
                ol = first_child(li, XHTML('ol'))
                if child is not None and ol is not None:
                    process_nav_node(ol, child)

        for nav in root.iterdescendants(XHTML('nav')):
            if nav.get(et) == 'toc':
                ol = first_child(nav, XHTML('ol'))
                if ol is not None:
                    process_nav_node(ol, navmap)
                    break
        else:
            return

        with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
            f.write(etree.tostring(ncx, encoding='utf-8'))
        ncx_id = opf.add_path_to_manifest(f.name, NCX_MIME)
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)
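
To make the conversion concrete, here is a hedged illustration of its input and output, assuming the nav document is named nav.xhtml:

nav_markup = (
    '<html xmlns="http://www.w3.org/1999/xhtml"'
    ' xmlns:epub="http://www.idpf.org/2007/ops"><body>'
    '<nav epub:type="toc"><ol>'
    '<li><a href="#start">Chapter 1</a></li>'
    '</ol></nav></body></html>')
# convert_epub3_nav would emit an NCX whose navMap holds a single navPoint
# with navLabel/text "Chapter 1" and content src "nav.xhtml#start";
# fragment-only hrefs get the nav file's basename prepended.
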
Example #41
0
def parse_details_page(url, log, timeout, browser, domain):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring
    try:
        raw = browser.open_novisit(url, timeout=timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            log.error('URL malformed: %r'%url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r'%url
            log.exception(msg)
        return

    oraw = raw
    if 'amazon.com.br' in url:
        raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r'%url)
        return

    try:
        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r'%url
        log.exception(msg)
        return
    if domain == 'jp':
        for a in root.xpath('//a[@href]'):
            if 'black-curtain-redirect.html' in a.get('href'):
                url = 'http://amazon.co.jp'+a.get('href')
                log('Black curtain redirect found, following')
                return parse_details_page(url, log, timeout, browser, domain)

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r'%url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
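
A hypothetical caller, assuming log and browser objects are already in scope (the ASIN in the URL is a placeholder):

result = parse_details_page('https://www.amazon.com/dp/XXXXXXXXXX',
                            log, 30, browser, 'com')
if result is not None:
    oraw, root, selector = result
    # css_selectors.Select objects are callable with a CSS selector
    # and yield the matching lxml elements.
    title_nodes = tuple(selector('h1#title'))
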
Example #42
0
 def test_lxml_unicode_parsing(self):
     from calibre.ebooks.chardet import xml_to_unicode
     with open(
             os.path.join(os.path.dirname(os.path.abspath(__file__)),
                          'unicode-test.opf'), 'rb') as f:
         raw = f.read()
     text = xml_to_unicode(raw,
                           strip_encoding_pats=True,
                           resolve_entities=True,
                           assume_utf8=True)[0]
     self.assertIsNotNone(safe_xml_fromstring(text))
Example #43
0
    def read_ncx_toc(self, toc, root=None):
        self.base_path = os.path.dirname(toc)
        if root is None:
            raw  = xml_to_unicode(open(toc, 'rb').read(), assume_utf8=True,
                    strip_encoding_pats=True)[0]
            root = etree.fromstring(raw, parser=etree.XMLParser(recover=True,
                no_network=True))
        xpn = {'re': 'http://exslt.org/regular-expressions'}
        XPath = functools.partial(etree.XPath, namespaces=xpn)

        def get_attr(node, default=None, attr='playorder'):
            for name, val in node.attrib.items():
                if name and val and name.lower().endswith(attr):
                    return val
            return default

        nl_path = XPath('./*[re:match(local-name(), "navlabel$", "i")]')
        txt_path = XPath('./*[re:match(local-name(), "text$", "i")]')
        content_path = XPath('./*[re:match(local-name(), "content$", "i")]')
        np_path = XPath('./*[re:match(local-name(), "navpoint$", "i")]')

        def process_navpoint(np, dest):
            try:
                play_order = int(get_attr(np, 1))
            except:
                play_order = 1
            href = fragment = text = None
            nd = dest
            nl = nl_path(np)
            if nl:
                nl = nl[0]
                text = u''
                for txt in txt_path(nl):
                    text += etree.tostring(txt, method='text',
                            encoding='unicode', with_tail=False)
                content = content_path(np)
                if content and text:
                    content = content[0]
                    # if get_attr(content, attr='src'):
                    purl = urlparse(content.get('src'))
                    href, fragment = unquote(purl[2]), unquote(purl[5])
                    nd = dest.add_item(href, fragment, text)
                    nd.play_order = play_order

            for c in np_path(np):
                process_navpoint(c, nd)

        nm = XPath('//*[re:match(local-name(), "navmap$", "i")]')(root)
        if not nm:
            raise ValueError('NCX files must have a <navmap> element.')
        nm = nm[0]

        for child in np_path(nm):
            process_navpoint(child, self)
Example #45
0
def parse_html(markup):
    from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode, substitute_entites
    from calibre.utils.cleantext import clean_xml_chars
    if isinstance(markup, unicode_type):
        markup = strip_encoding_declarations(markup)
        markup = substitute_entites(markup)
    else:
        markup = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0]
    markup = clean_xml_chars(markup)
    from html5_parser.soup import parse
    return parse(markup, return_root=False)
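
Because parse_html branches on the input type, callers can pass either bytes or text without pre-decoding. A small sketch (the markup is illustrative):

# Bytes are routed through xml_to_unicode; text goes through
# strip_encoding_declarations and entity substitution instead.
soup = parse_html(u'<html><body><p>caf\xe9</p></body></html>')
p = soup.find('p')  # BeautifulSoup-style API from html5_parser.soup
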
Example #46
0
    def identify(
            self,
            log,
            result_queue,
            abort,
            title=None,
            authors=None,  # {{{
            identifiers={},
            timeout=30):
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.utils.cleantext import clean_ascii_chars

        XPath = partial(etree.XPath, namespaces=NAMESPACES)
        entry = XPath('//atom:entry')

        query = self.create_query(log,
                                  title=title,
                                  authors=authors,
                                  identifiers=identifiers)
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception('Failed to make identify query: %r' % query)
            return as_unicode(e)
        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(xml_to_unicode(
                clean_ascii_chars(raw), strip_encoding_pats=True)[0],
                                    parser=parser)
            entries = entry(feed)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)
        if not entries and identifiers and title and authors and \
                not abort.is_set():
            return self.identify(log,
                                 result_queue,
                                 abort,
                                 title=title,
                                 authors=authors,
                                 timeout=timeout)

        # There is no point running these queries in threads as douban
        # throttles requests returning 403 Forbidden errors
        self.get_all_details(br, log, entries, abort, result_queue, timeout)

        return None
Example #47
0
    def get_details(self):
        '''
        Fetch the book's detail information from the book details page.
        '''
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.chardet import xml_to_unicode
        import html5lib

        try:
            raw = self.browser.open_novisit(
                self.url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = '17k.com timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        oraw = raw
        raw = xml_to_unicode(raw,
                             strip_encoding_pats=True,
                             resolve_entities=True)[0]
        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        try:
            root = html5lib.parse(clean_ascii_chars(raw),
                                  treebuilder='lxml',
                                  namespaceHTMLElements=False)
        except:
            msg = 'Failed to parse 17k.com details page: %r' % self.url
            self.log.exception(msg)
            return

        errmsg = root.xpath('//*[@id="errorMessage"]')
        if errmsg:
            msg = 'Failed to parse 17k.com details page: %r' % self.url
            msg += self.tostring(errmsg, method='text',
                                 encoding=unicode).strip()
            self.log.error(msg)
            return

        self.parse_details(oraw, root)
Example #48
0
def render_cover(opf, opf_path, zf, reader=None):
    from calibre.ebooks import render_html_svg_workaround
    from calibre.utils.logging import default_log

    cpage = opf.first_spine_item()
    if not cpage:
        return
    if reader is not None and reader.encryption_meta.is_encrypted(cpage):
        return

    with TemporaryDirectory('_epub_meta') as tdir:
        with CurrentDir(tdir):
            zf.extractall()
            opf_path = opf_path.replace('/', os.sep)
            cpage = os.path.join(tdir, os.path.dirname(opf_path), cpage)
            if not os.path.exists(cpage):
                return

            if isosx:
                # On OS X trying to render a HTML cover which uses embedded
                # fonts more than once in the same process causes a crash in Qt
                # so be safe and remove the fonts as well as any @font-face
                # rules
                for f in walk('.'):
                    if os.path.splitext(f)[1].lower() in ('.ttf', '.otf'):
                        os.remove(f)
                ffpat = re.compile(br'@font-face.*?{.*?}',
                                   re.DOTALL | re.IGNORECASE)
                with open(cpage, 'r+b') as f:
                    raw = f.read()
                    f.truncate(0)
                    f.seek(0)
                    raw = ffpat.sub(b'', raw)
                    f.write(raw)
                from calibre.ebooks.chardet import xml_to_unicode
                raw = xml_to_unicode(raw,
                                     strip_encoding_pats=True,
                                     resolve_entities=True)[0]
                from lxml import html
                for link in html.fromstring(raw).xpath('//link'):
                    href = link.get('href', '')
                    if href:
                        path = os.path.join(os.path.dirname(cpage), href)
                        if os.path.exists(path):
                            with open(path, 'r+b') as f:
                                raw = f.read()
                                f.truncate(0)
                                f.seek(0)
                                raw = ffpat.sub(b'', raw)
                                f.write(raw)

            return render_html_svg_workaround(cpage, default_log)
Example #49
0
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = clean_xml_chars(raw)
    root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False, sanitize_names=True)
    if (discard_namespaces and root.tag != 'html') or (
        not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)):
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
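
A hedged usage sketch; the attribute name given to linenumber_attribute is arbitrary:

root = parse_html5(b'<title>t</title><p>one<p>two',
                   linenumber_attribute='data-lnum')
# root is the namespaced <html> element; with linenumber_attribute set,
# html5_parser stamps each element with the source line it started on.
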
Example #51
0
def parse_opf(stream_or_path):
    stream = stream_or_path
    if not hasattr(stream, 'read'):
        stream = open(stream, 'rb')
    raw = stream.read()
    if not raw:
        raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
    raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
    raw = raw[raw.find('<'):]
    root = etree.fromstring(raw, PARSER)
    if root is None:
        raise ValueError('Not an OPF file')
    return root
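
parse_opf accepts either a filesystem path or an open binary stream; a minimal sketch (the file name is a placeholder):

root = parse_opf('metadata.opf')
with open('metadata.opf', 'rb') as f:
    root = parse_opf(f)
# The raw[raw.find('<'):] slice above drops any BOM or leading junk
# before the first tag, which keeps lxml from rejecting the document.
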
Example #55
0
 def __new__(cls, path, mime_type=None, read_anchor_map=True,
         run_char_count=True, from_epub=False, read_links=True):
     ppath = path.partition('#')[0]
     if not os.path.exists(path) and os.path.exists(ppath):
         path = ppath
     obj = super(SpineItem, cls).__new__(cls, path)
     with lopen(path, 'rb') as f:
         raw = f.read()
     if from_epub:
         # According to the spec, HTML in EPUB must be encoded in utf-8 or
         # utf-16. Furthermore, there exist epub files produced by the usual
         # incompetents that have utf-8 encoded HTML files that contain
         # incorrect encoding declarations. See
         # http://www.idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#Section1.4.1.2
         # http://www.idpf.org/epub/30/spec/epub30-publications.html#confreq-xml-enc
         # https://bugs.launchpad.net/bugs/1188843
         # So we first decode with utf-8 and only if that fails we try xml_to_unicode. This
         # is the same algorithm as that used by the conversion pipeline (modulo
         # some BOM based detection). Sigh.
         try:
             raw, obj.encoding = raw.decode('utf-8'), 'utf-8'
         except UnicodeDecodeError:
             raw, obj.encoding = xml_to_unicode(raw)
     else:
         raw, obj.encoding = xml_to_unicode(raw)
     obj.character_count = character_count(raw) if run_char_count else 10000
     obj.anchor_map = anchor_map(raw) if read_anchor_map else {}
     obj.all_links = all_links(raw) if read_links else set()
     obj.verified_links = set()
     obj.start_page = -1
     obj.pages      = -1
     obj.max_page   = -1
     obj.index_entries = []
     if mime_type is None:
         mime_type = guess_type(obj)[0]
     obj.mime_type = mime_type
     obj.is_single_page = None
     return obj
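
The from_epub branch encodes a utf-8-first policy: a strict utf-8 decode is trusted over sniffing, because the EPUB spec mandates utf-8 or utf-16 while in-file encoding declarations are frequently wrong. Distilled into a standalone sketch:

def decode_epub_html(raw):
    # Prefer a clean utf-8 decode; fall back to detection only on failure.
    try:
        return raw.decode('utf-8'), 'utf-8'
    except UnicodeDecodeError:
        from calibre.ebooks.chardet import xml_to_unicode
        return xml_to_unicode(raw)  # (text, detected_encoding)
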
Example #56
0
    def __init__(self, stream, logger):
        self.logger = logger
        src = stream.read()
        self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0])
        self.objects = {}
        for obj in self.soup.findAll(objid=True):
            self.objects[obj['objid']] = obj

        self.parsed_objects = {}
        self.first_pass()
        self.second_pass()
        self.third_pass()
        self.fourth_pass()
        self.fifth_pass()
Example #58
0
def parse_details_page(url, log, timeout, browser):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring
    try:
        raw = browser.open_novisit(
            url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                        e.getcode() == 404:
            log.error('URL malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % url
            log.exception(msg)
        return

    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
                         resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r' % url)
        return

    try:
        root = html5lib.parse(raw,
                              treebuilder='lxml',
                              namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r' % url
        log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r' % url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
Example #59
0
    def identify(self, log, result_queue, abort, title=None, authors=None,
                 identifiers={}, timeout=90):  # {{{
        from calibre.ebooks.chardet import xml_to_unicode
        from HTMLParser import HTMLParser
        from lxml import etree, html

        if not self.is_configured():
            return
        query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
        if not query:
            err = u'Insufficient metadata to construct query'
            log.error(err)
            return err

        try:
            raw = self.browser.open_novisit(query).read()
        except Exception as e:
            log.exception(u'Failed to make identify query: %r' % query)
            return as_unicode(e)

        try:
            doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
            entries_block = doc.xpath(u'//div[@class="bSearchResult"]')

            if entries_block:
                entries = doc.xpath(u'//div[contains(@itemprop, "itemListElement")]')
                # for entry in entries:
                #   log.debug('entries %s' % etree.tostring(entry))
                metadata = self.get_metadata(log, entries, title, authors, identifiers)
                self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
            else:
                # Redirect page: trying to extract ozon_id from javascript data
                h = HTMLParser()
                entry_string = (h.unescape(unicode(etree.tostring(doc, pretty_print=True))))
                id_title_pat = re.compile(u'products":\[{"id":(\d{7}),"name":"([а-яА-Я :\-0-9]+)')
                # result containing ozon_id and entry_title
                entry_info = re.search(id_title_pat, entry_string)
                ozon_id = entry_info.group(1) if entry_info else None
                entry_title = entry_info.group(2) if entry_info else None

                if ozon_id:
                    metadata = self.to_metadata_for_single_entry(log, ozon_id, entry_title, authors)
                    identifiers['ozon'] = ozon_id
                    self.get_all_details(log, [metadata], abort, result_queue, identifiers, timeout, cachedPagesDict={})
                else:
                    log.error('No SearchResults in Ozon.ru response found')

        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)
Example #60
0
        def fget(self):
            if self.compressed_info_size == 0:
                raise LRFException("This document has no meta info")
            size = self.compressed_info_size - 4
            self._file.seek(self.info_start)
            try:
                src = zlib.decompress(self._file.read(size))
                if len(src) != self.uncompressed_info_size:
                    raise LRFException("Decompression of document meta info "
                                       "yielded unexpected results")

                src = xml_to_unicode(src, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)[0]
                return dom.parseString(src)
            except zlib.error:
                raise LRFException("Unable to decompress document meta information")