Ejemplo n.º 1
0
def search(query, max_results=10, timeout=60, write_raw_to=None):
    # Run a search against Project Gutenberg and yield SearchResult objects
    # that have at least one downloadable format.
    url = 'https://www.gutenberg.org/ebooks/search/?query={}&submit_search=Search'.format(
        quote_plus(query))

    br = browser()
    raw = br.open(url).read()

    if write_raw_to is not None:
        # Dump the raw search page, mainly useful for debugging
        with open(write_raw_to, 'wb') as f:
            f.write(raw)

    root = parse(raw)
    CSSSelect = Select(root)
    remaining = max_results
    for li in CSSSelect('li.booklink'):
        if remaining <= 0:
            break
        remaining -= 1

        result = SearchResult()
        anchor = next(CSSSelect('a.link', li))
        result.detail_item = absurl(anchor.get('href'))
        result.title = etree.tostring(
            next(CSSSelect('span.title', li)),
            method='text', encoding='unicode').strip()
        try:
            result.author = etree.tostring(
                next(CSSSelect('span.subtitle', li)),
                method='text', encoding='unicode').strip()
        except StopIteration:
            # No subtitle span means no author information
            result.author = ""
        thumb = next(CSSSelect('img.cover-thumb', li), None)
        if thumb is not None:
            result.cover_url = absurl(thumb.get('src'))

        # Get the formats and direct download links from the details page.
        details_doc = parse(br.open_novisit(result.detail_item).read())
        doc_select = Select(details_doc)
        for tr in doc_select('table.files tr[typeof="pgterms:file"]'):
            for link in doc_select('a.link', tr):
                href, mt = link.get('href'), link.get('type')
                ext = mimetypes.guess_extension(mt.split(';')[0]) if mt else None
                if href and ext:
                    fmt = ext[1:].upper().strip()
                    # Keep only the first link seen for each format
                    if fmt not in result.downloads:
                        result.downloads[fmt] = absurl(href.split('?')[0])
                    break

        result.formats = ', '.join(result.downloads.keys())
        if result.formats:
            yield result
Ejemplo n.º 2
0
def search_kobo(query, max_results=10, timeout=60, write_html_to=None):
    # Search kobobooks.com and yield SearchResult objects for matches that
    # have a title, at least one author and a details URL.
    from css_selectors import Select
    url = 'https://www.kobobooks.com/search/search.html?q=' + quote_plus(query)
    raw = read_url(url, timeout=timeout)
    if write_html_to is not None:
        # Dump the raw search page, mainly useful for debugging
        with open(write_html_to, 'w') as f:
            f.write(raw)
    doc = html.fromstring(raw)
    select = Select(doc)
    for count, item in enumerate(select('.result-items .item-wrapper.book')):
        if count == max_results:
            break
        img = next(select('.item-image img[src]', item), None)
        if img is None:
            cover_url = None
        else:
            cover_url = img.get('src')
            # Protocol-relative URLs are normalized to https
            if cover_url.startswith('//'):
                cover_url = 'https:' + cover_url

        heading = next(select('h2.title', item), None)
        if heading is None:
            title = None
        else:
            title = etree.tostring(heading, method='text',
                                   encoding='unicode').strip()
            link = next(select('a[href]', heading), None)
            url = None if link is None else link.get('href')
        if title:
            for sub in select('p.subtitle', item):
                title += ' - ' + etree.tostring(
                    sub, method='text', encoding='unicode').strip()

        authors = authors_to_string([
            etree.tostring(a, method='text', encoding='unicode').strip()
            for a in select('.contributors a.contributor-name', item)
        ])

        price_elem = next(select('p.price', item), None)
        price = None if price_elem is None else etree.tostring(
            price_elem, method='text', encoding='unicode').strip()

        if title and authors and url:
            s = SearchResult()
            s.cover_url = cover_url
            s.title = title
            s.author = authors
            s.price = price
            s.detail_item = url
            s.formats = 'EPUB'
            s.drm = SearchResult.DRM_UNKNOWN

            yield s
Ejemplo n.º 3
0
    def get_details(self):
        # Obtain the parsed details page, either from the preparsed cache or
        # by fetching and parsing it, then hand it off to parse_details().
        if self.preparsed_root is not None:
            raw, root, selector = self.preparsed_root
        else:
            raw, root, selector = parse_details_page(
                self.url, self.log, self.timeout, self.browser, self.domain)

        from css_selectors import Select
        self.selector = Select(root)
        self.parse_details(raw, root)
Ejemplo n.º 4
0
def parse_details_page(url, log, timeout, browser, domain):
    """Download and parse an Amazon book details page.

    Returns a 3-tuple ``(raw_bytes, lxml_root, css_selector)`` on success
    and ``None`` on any failure (404, timeout, unparseable HTML, or an
    Amazon error page). For the ``jp`` domain, "black curtain" redirect
    pages are followed recursively.
    """
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring
    try:
        raw = browser.open_novisit(url, timeout=timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            log.error('URL malformed: %r'%url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r'%url
            log.exception(msg)
        return

    oraw = raw
    if 'amazon.com.br' in url:
        raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r'%url)
        return

    try:
        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                namespaceHTMLElements=False)
    except Exception:  # narrowed from a bare except
        msg = 'Failed to parse amazon details page: %r'%url
        log.exception(msg)
        return
    if domain == 'jp':
        for a in root.xpath('//a[@href]'):
            if 'black-curtain-redirect.html' in a.get('href'):
                url = 'http://amazon.co.jp'+a.get('href')
                log('Black curtain redirect found, following')
                return parse_details_page(url, log, timeout, browser, domain)

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r'%url
        # BUG FIX: tostring() must be given an element, not the xpath result
        # list, and the encoding must be the string 'unicode' -- the bare
        # name `unicode` does not exist on Python 3
        msg += tostring(errmsg[0], method='text', encoding='unicode').strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
Ejemplo n.º 5
0
def parse_details_page(url, log, timeout, browser):
    """Download and parse an Amazon China (gb18030-encoded) details page.

    Returns a 3-tuple ``(raw_text, lxml_root, css_selector)`` on success
    and ``None`` on any failure (404, timeout, unparseable HTML, or an
    Amazon error page).
    """
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring
    try:
        raw = browser.open_novisit(
            url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                        e.getcode() == 404:
            log.error('URL malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % url
            log.exception(msg)
        return

    oraw = raw  # keep the pristine text for the caller
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
                         resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r' % url)
        return

    try:
        root = html5lib.parse(raw,
                              treebuilder='lxml',
                              namespaceHTMLElements=False)
    except Exception:  # narrowed from a bare except
        msg = 'Failed to parse amazon details page: %r' % url
        log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r' % url
        # BUG FIX: tostring() must be given an element, not the xpath result
        # list, and the encoding must be the string 'unicode' -- the bare
        # name `unicode` does not exist on Python 3
        msg += tostring(errmsg[0], method='text', encoding='unicode').strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
Ejemplo n.º 6
0
def find_first_matching_rule(container,
                             html_file_name,
                             raw_html,
                             class_data,
                             lnum_attr='data-lnum'):
    # Locate the tag described by class_data (a sourceline plus a tag-name
    # path), then walk every <style> tag and linked stylesheet in document
    # order, returning the first CSS rule that applies to it via the given
    # class. Raises NoMatchingTagFound / NoMatchingRuleFound on failure.
    lnum, tags = class_data['sourceline_address']
    class_name = class_data['class']
    root = parse(raw_html,
                 decoder=lambda x: x.decode('utf-8'),
                 line_numbers=True,
                 linenumber_attribute=lnum_attr)
    tags_on_line = root.xpath(f'//*[@{lnum_attr}={lnum}]')
    if [barename(t.tag) for t in tags_on_line][:len(tags)] != tags:
        raise NoMatchingTagFound(
            f'No tag matching the specification was found in {html_file_name}')
    target_elem = tags_on_line[len(tags) - 1]
    select = Select(root, ignore_inappropriate_pseudo_classes=True)
    for tag in root.iter('*'):
        tn = barename(tag.tag)
        if tn == 'style':
            # Only inline CSS stylesheets are considered
            if not tag.text or tag.get('type', 'text/css') != 'text/css':
                continue
            try:
                sheet = container.parse_css(tag.text)
            except Exception:
                continue
            res = find_first_rule_that_matches_elem(
                container, target_elem, select, class_name, sheet.cssRules,
                html_file_name)
            if res is not None:
                return res._replace(
                    style_tag_address=(int(tag.get(lnum_attr)), ['style']))
        elif tn == 'link':
            if not tag.get('href') or tag.get('rel') != 'stylesheet':
                continue
            sname = container.href_to_name(tag.get('href'), html_file_name)
            try:
                sheet = container.parsed(sname)
            except Exception:
                continue
            if not hasattr(sheet, 'cssRules'):
                # The linked file did not parse as CSS
                continue
            res = find_first_rule_that_matches_elem(
                container, target_elem, select, class_name, sheet.cssRules,
                sname)
            if res is not None:
                return res
    raise NoMatchingRuleFound(
        f'No CSS rules that apply to the specified tag in {html_file_name} with the class {class_name} found'
    )
Ejemplo n.º 7
0
    def parse(self, raw):
        """Parse an Edelweiss product details page into a Metadata object.

        Extracts title and authors from the banner spans, ISBNs from the
        SKU group, tags from the BISAC group, publisher, publication date,
        comments and a cover URL. Returns the populated Metadata instance.
        """
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import parse_only_date, UNDEFINED_DATE
        from css_selectors import Select
        root = parse_html(raw)
        selector = Select(root)
        sku = next(selector('div.sku.attGroup'))
        info = sku.getparent()
        top = info.getparent().getparent()
        banner = top.find('div')
        spans = banner.findall('span')
        title = ''
        for i, span in enumerate(spans):
            # The title is the first span plus any following spans styled at 12pt
            if i == 0 or '12pt' in span.get('style', ''):
                title += astext(span)
            else:
                break
        # The last span holds the author list; strip parenthesized roles
        authors = [
            re.sub(r'\(.*\)', '', x).strip()
            for x in astext(spans[-1]).split(',')
        ]
        mi = Metadata(title.strip(), authors)

        # Identifiers
        isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
        for isbn in isbns:
            if isbn:
                self.plugin.cache_isbn_to_identifier(isbn, self.sku)
        # Prefer the longest (13-digit) ISBN
        isbns = sorted(isbns, key=lambda x: len(x) if x else 0, reverse=True)
        if isbns and isbns[0]:
            mi.isbn = isbns[0]
        mi.set_identifier('edelweiss', self.sku)

        # Tags
        bisac = tuple(selector('div.bisac.attGroup'))
        if bisac:
            bisac = astext(bisac[0])
            mi.tags = [x.strip() for x in bisac.split(',')]
            mi.tags = [
                t[1:].strip() if t.startswith('&') else t for t in mi.tags
            ]

        # Publisher
        pub = tuple(selector('div.supplier.attGroup'))
        if pub:
            pub = astext(pub[0])
            mi.publisher = pub

        # Pubdate
        pub = tuple(selector('div.shipDate.attGroupItem'))
        if pub:
            pub = astext(pub[0])
            parts = pub.partition(':')[0::2]
            pub = parts[1] or parts[0]
            try:
                if ', Ship Date:' in pub:
                    pub = pub.partition(', Ship Date:')[0]
                q = parse_only_date(pub, assume_utc=True)
                if q.year != UNDEFINED_DATE:
                    mi.pubdate = q
            except Exception:  # narrowed from a bare except
                self.log.exception('Error parsing published date: %r' % pub)

        # Comments
        comm = ''
        general = tuple(selector('div#pd-general-overview-content'))
        if general:
            q = self.render_comments(general[0])
            if q != '<p>No title summary available. </p>':
                comm += q
        general = tuple(selector('div#pd-general-contributor-content'))
        if general:
            comm += self.render_comments(general[0])
        general = tuple(selector('div#pd-general-quotes-content'))
        if general:
            comm += self.render_comments(general[0])
        if comm:
            mi.comments = comm

        # Cover: request the larger "flyout" variant of the jacket image
        img = tuple(selector('img.title-image[src]'))
        if img:
            href = img[0].get('src').replace('jacket_covers/medium/',
                                             'jacket_covers/flyout/')
            self.plugin.cache_identifier_to_cover_url(self.sku, href)

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(
            self.sku) is not None

        return mi
Ejemplo n.º 8
0
    def identify(
            self,
            log,
            result_queue,
            abort,
            title=None,
            authors=None,  # {{{
            identifiers=None,
            timeout=30):
        """Find Edelweiss records matching the given metadata.

        Resolves a direct book URL when an 'edelweiss' identifier is
        present; otherwise runs a search query and filters the result list,
        skipping audio books and (when no ISBN is available) entries whose
        authors do not match. Spawns up to 5 Worker threads to fetch the
        details of the matched entries and waits until they finish or
        ``abort`` is set.
        """
        # Works on both Python 3 and legacy Python 2 installs
        try:
            from urllib.parse import parse_qs
        except ImportError:
            from urlparse import parse_qs

        # BUG FIX: the default used to be a shared mutable dict ({});
        # normalize a None default instead.
        if identifiers is None:
            identifiers = {}

        book_url = self._get_book_url(identifiers.get('edelweiss', None))
        br = self.browser
        if book_url:
            entries = [(book_url, identifiers['edelweiss'])]
        else:
            entries = []
            query = self.create_query(log,
                                      title=title,
                                      authors=authors,
                                      identifiers=identifiers)
            if not query:
                log.error('Insufficient metadata to construct query')
                return
            log('Using query URL:', query)
            try:
                raw = br.open_novisit(query, timeout=timeout).read()
            except Exception as e:
                log.exception('Failed to make identify query: %r' % query)
                return as_unicode(e)

            try:
                root = parse_html(raw)
            except Exception as e:
                log.exception('Failed to parse identify results')
                return as_unicode(e)
            from css_selectors import Select
            select = Select(root)
            has_isbn = check_isbn(identifiers.get('isbn', None)) is not None
            if not has_isbn:
                author_tokens = set(x.lower() for x in self.get_author_tokens(
                    authors, only_first_author=True))
            for entry in select('div.listRow div.listRowMain'):
                a = entry.xpath(
                    'descendant::a[contains(@href, "sku=") and contains(@href, "productDetailPage.aspx")]'
                )
                if not a:
                    continue
                href = a[0].get('href')
                prefix, qs = href.partition('?')[0::2]
                sku = parse_qs(qs).get('sku', None)
                if sku and sku[0]:
                    sku = sku[0]
                    # Cache any ISBNs listed for this SKU
                    div = tuple(select('div.sku.attGroup'))
                    if div:
                        text = astext(div[0])
                        isbns = [
                            check_isbn(x.strip()) for x in text.split(',')
                        ]
                        for isbn in isbns:
                            if isbn:
                                self.cache_isbn_to_identifier(isbn, sku)
                    # Cache the larger "flyout" variant of any thumbnail cover
                    for img in entry.xpath(
                            'descendant::img[contains(@src, "/jacket_covers/thumbnail/")]'
                    ):
                        self.cache_identifier_to_cover_url(
                            sku,
                            img.get('src').replace('/thumbnail/', '/flyout/'))

                    div = tuple(select('div.format.attGroup'))
                    text = astext(div[0]).lower()
                    if 'audio' in text or 'mp3' in text:  # Audio-book, ignore
                        continue
                    if not has_isbn:
                        # edelweiss returns matches based only on title, so we
                        # filter by author manually
                        div = tuple(select('div.contributor.attGroup'))
                        try:
                            entry_authors = set(
                                self.get_author_tokens([
                                    x.strip()
                                    for x in astext(div[0]).lower().split(',')
                                ]))
                        except IndexError:
                            entry_authors = set()
                        if not entry_authors.issuperset(author_tokens):
                            continue
                    entries.append((self._get_book_url(sku), sku))

        # Nothing found with identifiers: retry with title/authors only
        if (not entries and identifiers and title and authors
                and not abort.is_set()):
            return self.identify(log,
                                 result_queue,
                                 abort,
                                 title=title,
                                 authors=authors,
                                 timeout=timeout)

        if not entries:
            return

        workers = [
            Worker(skul, url, i, result_queue, br.clone_browser(), timeout,
                   log, self) for i, (url, skul) in enumerate(entries[:5])
        ]

        for w in workers:
            w.start()
            # Don't send all requests at the same time
            time.sleep(0.1)

        # Wait for the workers, polling so that abort is honored promptly
        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.2)
                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break
Ejemplo n.º 9
0
    def find_page_breaks(self, item):
        """Find elements in *item* where a CSS page-break occurs.

        On first use, collects (selector, is_before) pairs from all
        stylesheets that declare page-break-before/after. Returns a pair
        ``(page_breaks, page_break_ids)`` where page_breaks is a list of
        (XPath, break-is-before) tuples in document order and
        page_break_ids the matching element ids (ids are assigned when
        missing).
        """
        if self.page_break_selectors is None:
            self.page_break_selectors = set()
            stylesheets = [
                x.data for x in self.oeb.manifest if x.media_type in OEB_STYLES
            ]
            for rule in rules(stylesheets):
                before = getattr(
                    rule.style.getPropertyCSSValue('page-break-before'),
                    'cssText', '').strip().lower()
                after = getattr(
                    rule.style.getPropertyCSSValue('page-break-after'),
                    'cssText', '').strip().lower()
                # Best effort: malformed rules are simply skipped
                try:
                    if before and before not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add(
                            (rule.selectorText, True))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-before')
                except Exception:  # narrowed from a bare except
                    pass
                try:
                    if after and after not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add(
                            (rule.selectorText, False))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-after')
                except Exception:  # narrowed from a bare except
                    pass
        page_breaks = set()
        select = Select(item.data)
        if not self.page_break_selectors:
            return [], []
        body = item.data.xpath('//h:body', namespaces=NAMESPACES)
        if not body:
            return [], []
        descendants = frozenset(body[0].iterdescendants('*'))

        for selector, before in self.page_break_selectors:
            try:
                for elem in select(selector):
                    # Only body content can carry a page break
                    if elem in descendants and elem.tag.rpartition(
                            '}')[2].lower() not in {
                                'html', 'body', 'head', 'style', 'script',
                                'meta', 'link'
                            }:
                        elem.set('pb_before', '1' if before else '0')
                        page_breaks.add(elem)
            except SelectorError as err:
                self.log.warn(
                    'Ignoring page breaks specified with invalid CSS selector: %r (%s)'
                    % (selector, as_unicode(err)))

        # Record document order so the set can be sorted deterministically
        for i, elem in enumerate(item.data.iter('*')):
            try:
                elem.set('pb_order', str(i))
            except TypeError:  # Cant set attributes on comment nodes etc.
                continue

        page_breaks = list(page_breaks)
        page_breaks.sort(key=lambda x: int(x.get('pb_order')))
        page_break_ids, page_breaks_ = [], []
        for i, x in enumerate(page_breaks):
            x.set('id', x.get('id', 'calibre_pb_%d' % i))
            elem_id = x.get('id')
            try:
                xp = XPath('//*[@id="%s"]' % elem_id)
            except Exception:  # narrowed from a bare except
                try:
                    xp = XPath("//*[@id='%s']" % elem_id)
                except Exception:  # narrowed from a bare except
                    # The id has both a quote and an apostrophe or some other
                    # Just replace it since I doubt its going to work anywhere else
                    # either
                    elem_id = 'calibre_pb_%d' % i
                    x.set('id', elem_id)
                    xp = XPath('//*[@id=%r]' % elem_id)
            page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
            page_break_ids.append(elem_id)

        # Strip the temporary bookkeeping attributes
        for elem in item.data.iter(etree.Element):
            elem.attrib.pop('pb_order', False)
            elem.attrib.pop('pb_before', False)

        return page_breaks_, page_break_ids
Ejemplo n.º 10
0
def css_data(container, book_locale, result_data, *args):
    """Collect CSS usage statistics for every spine document in *container*.

    Parses all stylesheets (external, @import-ed and inline <style> tags),
    matches each rule's selector against each HTML file, and records
    per-class usage in ``result_data['classes']``. Returns a list of
    CSSEntry objects, one per rule, with match counts and locations.
    """
    import tinycss
    from tinycss.css21 import RuleSet, ImportRule

    def css_rules(file_name, rules, sourceline=0):
        # Flatten a parsed stylesheet into CSSRule objects; @import rules
        # are represented by the imported file's name (resolved later)
        ans = []
        for rule in rules:
            if isinstance(rule, RuleSet):
                selector = rule.selector.as_css()
                ans.append(
                    CSSRule(
                        selector,
                        RuleLocation(file_name, sourceline + rule.line,
                                     rule.column)))
            elif isinstance(rule, ImportRule):
                import_name = safe_href_to_name(container, rule.uri, file_name)
                if import_name and container.exists(import_name):
                    ans.append(import_name)
            elif getattr(rule, 'rules', False):
                # e.g. @media blocks: recurse into the nested rules
                ans.extend(css_rules(file_name, rule.rules, sourceline))
        return ans

    parser = tinycss.make_full_parser()
    importable_sheets = {}
    html_sheets = {}
    spine_names = {name for name, is_linear in container.spine_names}
    style_path, link_path = XPath('//h:style'), XPath('//h:link/@href')

    for name, mt in iteritems(container.mime_map):
        if mt in OEB_STYLES:
            importable_sheets[name] = css_rules(
                name,
                parser.parse_stylesheet(container.raw_data(name)).rules)
        elif mt in OEB_DOCS and name in spine_names:
            html_sheets[name] = []
            for style in style_path(container.parsed(name)):
                if style.get('type', 'text/css') == 'text/css' and style.text:
                    html_sheets[name].append(
                        css_rules(
                            name,
                            parser.parse_stylesheet(
                                force_unicode(style.text, 'utf-8')).rules,
                            style.sourceline - 1))

    rule_map = defaultdict(lambda: defaultdict(list))

    def rules_in_sheet(sheet):
        # Yield CSSRule objects, transparently following @import references
        for rule in sheet:
            if isinstance(rule, CSSRule):
                yield rule
            else:  # @import rule
                isheet = importable_sheets.get(rule)
                if isheet is not None:
                    for irule in rules_in_sheet(isheet):
                        yield irule

    def sheets_for_html(name, root):
        # Stylesheets linked from the HTML file via <link href=...>
        for href in link_path(root):
            tname = safe_href_to_name(container, href, name)
            sheet = importable_sheets.get(tname)
            if sheet is not None:
                yield sheet

    tt_cache = {}

    def tag_text(elem):
        # Serialize the opening tag of elem; attribute-less tags are cached
        ans = tt_cache.get(elem)
        if ans is None:
            tag = elem.tag.rpartition('}')[-1]
            if elem.attrib:
                attribs = ' '.join(
                    '%s="%s"' %
                    (k, prepare_string_for_xml(elem.get(k, ''), True))
                    for k in elem.keys())
                return '<%s %s>' % (tag, attribs)
            ans = tt_cache[elem] = '<%s>' % tag
        # BUG FIX: this return was missing, so cache hits and attribute-less
        # tags yielded None instead of the serialized tag
        return ans

    def matches_for_selector(selector, select, class_map, rule):
        lsel = selector.lower()
        try:
            matches = tuple(select(selector))
        except SelectorError:
            return ()
        # Record which classes on the matched elements the selector uses
        for elem in matches:
            for cls in elem.get('class', '').split():
                if '.' + cls.lower() in lsel:
                    class_map[cls][elem].append(rule)

        return (MatchLocation(tag_text(elem), elem.sourceline)
                for elem in matches)

    class_map = defaultdict(lambda: defaultdict(list))

    for name, inline_sheets in iteritems(html_sheets):
        root = container.parsed(name)
        cmap = defaultdict(lambda: defaultdict(list))
        for elem in root.xpath('//*[@class]'):
            for cls in elem.get('class', '').split():
                cmap[cls][elem] = []
        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        for sheet in chain(sheets_for_html(name, root), inline_sheets):
            for rule in rules_in_sheet(sheet):
                rule_map[rule][name].extend(
                    matches_for_selector(rule.selector, select, cmap, rule))
        for cls, elem_map in iteritems(cmap):
            class_elements = class_map[cls][name]
            for elem, usage in iteritems(elem_map):
                class_elements.append(
                    ClassElement(name, elem.sourceline, elem.get('class'),
                                 tag_text(elem), tuple(usage)))

    # The per-class data is returned via result_data; ans is then rebound
    # below to build the separate per-rule return value
    result_data['classes'] = ans = []
    for cls, name_map in iteritems(class_map):
        la = tuple(
            ClassFileMatch(name, tuple(class_elements), numeric_sort_key(name))
            for name, class_elements in iteritems(name_map) if class_elements)
        num_of_matches = sum(
            sum(len(ce.matched_rules) for ce in cfm.class_elements)
            for cfm in la)
        ans.append(ClassEntry(cls, num_of_matches, la, numeric_sort_key(cls)))

    ans = []
    for rule, loc_map in iteritems(rule_map):
        la = tuple(
            CSSFileMatch(name, tuple(locations), numeric_sort_key(name))
            for name, locations in iteritems(loc_map) if locations)
        count = sum(len(fm.locations) for fm in la)
        ans.append(CSSEntry(rule, count, la, numeric_sort_key(rule.selector)))

    return ans
Ejemplo n.º 11
0
    def __init__(self, tree, path, oeb, opts, profile=None,
            extra_css='', user_css='', base_css=''):
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles
            for x in output_profiles():
                if x.short_name == 'default':
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.body_font_size = self.profile.fbase
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        if base_css:
            stylesheets.append(parseString(base_css, validate=False))
        style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

        # Add cssutils parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile['name'],
                                        profile['props'],
                                        profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                log=logging.getLogger('calibre.css'))
        self.font_face_rules = []
        for elem in style_tags:
            if (elem.tag == XHTML('style') and
                elem.get('type', CSS_MIME) in OEB_STYLES):
                text = elem.text if elem.text else u''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
                    text = oeb.css_preprocessor(text)
                    # We handle @import rules separately
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            if rule.media.mediaText == 'amzn-mobi':
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    for rule in tuple(stylesheet.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
                        stylesheet.cssRules.remove(rule)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif elem.tag == XHTML('link') and elem.get('href') \
                 and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                 and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
                href = urlnormalize(elem.attrib['href'])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(path,
                        item.href))
                    continue
                stylesheets.append(sitem.data)
        csses = {'extra_css':extra_css, 'user_css':user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = x
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    stylesheets.append(stylesheet)
                except:
                    self.logger.exception('Failed to parse %s, ignoring.'%w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
        rules = []
        index = 0
        self.stylesheets = set()
        self.page_rule = {}
        for sheet_index, stylesheet in enumerate(stylesheets):
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                if rule.type == rule.MEDIA_RULE:
                    media = {rule.media.item(i) for i in
                             xrange(rule.media.length)}
                    if not media.intersection({'all', 'screen', 'amzn-kf8'}):
                        continue
                    for subrule in rule.cssRules:
                        rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0))
                        index += 1
                else:
                    rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0))
                    index = index + 1
        rules.sort()
        self.rules = rules
        self._styles = {}
        pseudo_pat = re.compile(ur':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
        select = Select(tree, ignore_inappropriate_pseudo_classes=True)

        for _, _, cssdict, text, _ in rules:
            fl = pseudo_pat.search(text)
            try:
                matches = tuple(select(text))
            except SelectorError as err:
                self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
                continue

            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(self.oeb,
                        'plumber_output_format', '').lower() in {u'mobi', u'docx'}:
                    # Fake first-letter
                    from lxml.builder import ElementMaker
                    E = ElementMaker(namespace=XHTML_NS)
                    for elem in matches:
                        for x in elem.iter('*'):
                            if x.text:
                                punctuation_chars = []
                                text = unicode(x.text)
                                while text:
                                    category = unicodedata.category(text[0])
                                    if category[0] not in {'P', 'Z'}:
                                        break
                                    punctuation_chars.append(text[0])
                                    text = text[1:]

                                special_text = u''.join(punctuation_chars) + \
                                        (text[0] if text else u'')
                                span = E.span(special_text)
                                span.set('data-fake-first-letter', '1')
                                span.tail = text[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                break
                else:  # Element pseudo-class
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r'[0-9.]+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    try:
                        del elem.attrib[prop]
                    except:
                        pass
                    if val:
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Ejemplo n.º 12
0
def remove_unused_css(container,
                      report=None,
                      remove_unused_classes=False,
                      merge_rules=False,
                      merge_rules_with_identical_properties=False):
    '''
    Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content.

    :param report: An optional callable that takes a single argument. It is called with information about the operations being performed.
    :param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed.
    :param merge_rules: If True, rules with identical selectors are merged.
    :param merge_rules_with_identical_properties: If True, rules with
        identical property declarations are merged.
    '''
    # Default to a no-op so callers may omit the report callback.
    report = report or (lambda x: x)

    def safe_parse(name):
        # container.parsed() raises TypeError for items that cannot be
        # parsed as CSS; such sheets are treated as absent (None).
        try:
            return container.parsed(name)
        except TypeError:
            pass

    # Parsed stylesheet per CSS item in the book, dropping unparseable ones.
    sheets = {
        name: safe_parse(name)
        for name, mt in iteritems(container.mime_map) if mt in OEB_STYLES
    }
    sheets = {k: v for k, v in iteritems(sheets) if v is not None}
    num_merged = num_rules_merged = 0
    # Optional pre-passes: collapse duplicate selectors / duplicate
    # property blocks before any usage analysis is done.
    if merge_rules:
        for name, sheet in iteritems(sheets):
            num = merge_identical_selectors(sheet)
            if num:
                container.dirty(name)
                num_merged += num
    if merge_rules_with_identical_properties:
        for name, sheet in iteritems(sheets):
            num = merge_identical_properties(sheet)
            if num:
                container.dirty(name)
                num_rules_merged += num
    # For every sheet, the set of other sheets it pulls in via @import.
    import_map = {
        name: get_imported_sheets(name, container, sheets)
        for name in sheets
    }
    if remove_unused_classes:
        # Lower-cased class names referenced anywhere in each sheet's rules.
        class_map = {
            name: {icu_lower(x)
                   for x in classes_in_rule_list(sheet.cssRules)}
            for name, sheet in iteritems(sheets)
        }
    # The style rules of each sheet; mark_used_selectors() flags entries of
    # these tuples as used while scanning the HTML files below.
    style_rules = {
        name: tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
        for name, sheet in iteritems(sheets)
    }

    removal_stats = {'rules': 0, 'selectors': 0}
    num_of_removed_classes = 0

    # Scan every HTML file, marking selectors that match real content.
    for name, mt in iteritems(container.mime_map):
        if mt not in OEB_DOCS:
            continue
        root = container.parsed(name)
        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        used_classes = set()
        # Inline <style> blocks are processed (and rewritten) per document.
        for style in root.xpath('//*[local-name()="style"]'):
            if style.get('type', 'text/css') == 'text/css' and style.text:
                sheet = container.parse_css(style.text)
                if merge_rules:
                    num = merge_identical_selectors(sheet)
                    if num:
                        num_merged += num
                        container.dirty(name)
                if merge_rules_with_identical_properties:
                    num = merge_identical_properties(sheet)
                    if num:
                        num_rules_merged += num
                        container.dirty(name)
                if remove_unused_classes:
                    used_classes |= {
                        icu_lower(x)
                        for x in classes_in_rule_list(sheet.cssRules)
                    }
                imports = get_imported_sheets(name,
                                              container,
                                              sheets,
                                              sheet=sheet)
                for imported_sheet in imports:
                    mark_used_selectors(style_rules[imported_sheet],
                                        container.log, select)
                    if remove_unused_classes:
                        used_classes |= class_map[imported_sheet]
                rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
                # Inline sheets are pruned immediately (unlike linked sheets,
                # which are only pruned after all documents are scanned).
                if mark_used_selectors(rules, container.log, select):
                    remove_unused_selectors_and_rules(sheet.cssRules, rules,
                                                      removal_stats)
                    style.text = force_unicode(sheet.cssText, 'utf-8')
                    pretty_script_or_style(container, style)
                    container.dirty(name)

        # Linked stylesheets (and their imports) are only marked here;
        # removal happens once, after every document has been seen.
        for link in root.xpath('//*[local-name()="link" and @href]'):
            sname = container.href_to_name(link.get('href'), name)
            if sname not in sheets:
                continue
            mark_used_selectors(style_rules[sname], container.log, select)
            if remove_unused_classes:
                used_classes |= class_map[sname]

            for iname in import_map[sname]:
                mark_used_selectors(style_rules[iname], container.log, select)
                if remove_unused_classes:
                    used_classes |= class_map[iname]

        if remove_unused_classes:
            # Strip class tokens not referenced by any applicable sheet,
            # preserving the original case/order of the kept tokens.
            for elem in root.xpath('//*[@class]'):
                original_classes, classes = elem.get('class', '').split(), []
                for x in original_classes:
                    if icu_lower(x) in used_classes:
                        classes.append(x)
                if len(classes) != len(original_classes):
                    if classes:
                        elem.set('class', ' '.join(classes))
                    else:
                        del elem.attrib['class']
                    num_of_removed_classes += len(original_classes) - len(
                        classes)
                    container.dirty(name)

    # Final pass: drop rules/selectors that were never marked used.
    for name, sheet in iteritems(sheets):
        any_found = remove_unused_selectors_and_rules(sheet.cssRules,
                                                      style_rules[name],
                                                      removal_stats)
        if any_found:
            container.dirty(name)

    num_changes = num_merged + num_of_removed_classes + num_rules_merged + removal_stats[
        'rules'] + removal_stats['selectors']
    if num_changes > 0:
        if removal_stats['rules']:
            report(
                ngettext('Removed one unused CSS style rule',
                         'Removed {} unused CSS style rules',
                         removal_stats['rules']).format(
                             removal_stats['rules']))
        if removal_stats['selectors']:
            report(
                ngettext('Removed one unused CSS selector',
                         'Removed {} unused CSS selectors',
                         removal_stats['selectors']).format(
                             removal_stats['selectors']))
        if num_of_removed_classes > 0:
            report(
                ngettext(
                    'Removed one unused class from the HTML',
                    'Removed {} unused classes from the HTML',
                    num_of_removed_classes).format(num_of_removed_classes))
        if num_merged > 0:
            report(
                ngettext('Merged one CSS style rule with identical selectors',
                         'Merged {} CSS style rules with identical selectors',
                         num_merged).format(num_merged))
        if num_rules_merged > 0:
            report(
                ngettext(
                    'Merged one CSS style rule with identical properties',
                    'Merged {} CSS style rules with identical properties',
                    num_rules_merged).format(num_rules_merged))
    # "Nothing found" messages are reported unconditionally (not only when
    # num_changes == 0).
    # NOTE(review): there is no corresponding "nothing merged" message for
    # merge_rules_with_identical_properties — possibly an oversight; confirm.
    if not removal_stats['rules']:
        report(_('No unused CSS style rules found'))
    if not removal_stats['selectors']:
        report(_('No unused CSS selectors found'))
    if remove_unused_classes and num_of_removed_classes == 0:
        report(_('No unused class attributes found'))
    if merge_rules and num_merged == 0:
        report(_('No style rules that could be merged found'))
    # True if anything in the book was changed.
    return num_changes > 0
Ejemplo n.º 13
0
def resolve_styles(container, name, select=None, sheet_callback=None):
    '''
    Compute the cascaded CSS styles for every element in the HTML file
    ``name``.

    :param container: The book container.
    :param name: The name of the HTML file to resolve styles for.
    :param select: An optional pre-built CSS selector engine for the file's
        parsed tree; one is created if not supplied.
    :param sheet_callback: An optional callable invoked as
        ``sheet_callback(sheet, sheet_name)`` for every stylesheet processed.
    :return: A 3-tuple of (resolve_property, resolve_pseudo_property,
        select) — the first two are partials for looking up resolved normal
        and pseudo-class styles per element.
    '''
    root = container.parsed(name)
    select = select or Select(root, ignore_inappropriate_pseudo_classes=True)
    # Per-element lists of StyleDeclaration, separately for normal rules and
    # for rules whose selector carries an inappropriate pseudo-class.
    style_map = defaultdict(list)
    pseudo_style_map = defaultdict(list)
    # Monotonic rule index shared across all sheets; used as the cascade
    # tie-breaker inside specificity().
    rule_index_counter = count()
    # Matches :first-letter and friends (one or two leading colons).
    pseudo_pat = re.compile(
        u':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)

    def process_sheet(sheet, sheet_name):
        # Walk the STYLE_RULEs of one sheet, recording a StyleDeclaration for
        # every element matched by every selector.
        if sheet_callback is not None:
            sheet_callback(sheet, sheet_name)
        for rule, sheet_name, rule_index in iterrules(
                container,
                sheet_name,
                rules=sheet,
                rule_index_counter=rule_index_counter,
                rule_type='STYLE_RULE'):
            for selector in rule.selectorList:
                text = selector.selectorText
                try:
                    matches = tuple(select(text))
                except SelectorError as err:
                    # Bad selectors are logged and skipped, not fatal.
                    container.log.error(
                        'Ignoring CSS rule with invalid selector: %r (%s)' %
                        (text, as_unicode(err)))
                    continue
                m = pseudo_pat.search(text)
                style = normalize_style_declaration(rule.style, sheet_name)
                if m is None:
                    for elem in matches:
                        style_map[elem].append(
                            StyleDeclaration(specificity(rule_index, selector),
                                             style, None))
                else:
                    # Pseudo-class rules are kept separately, keyed by the
                    # pseudo-class name (m.group(1)).
                    for elem in matches:
                        pseudo_style_map[elem].append(
                            StyleDeclaration(specificity(rule_index, selector),
                                             style, m.group(1)))

    # The user-agent sheet goes first so document rules can override it.
    process_sheet(html_css_stylesheet(container), 'user-agent.css')

    for elem in root.iterdescendants(XHTML('style'), XHTML('link')):
        if elem.tag.lower().endswith('style'):
            if not elem.text:
                continue
            sheet = container.parse_css(elem.text)
            sheet_name = name
        else:
            # Only stylesheet links with an acceptable type/rel/media are used.
            if (elem.get('type') or 'text/css').lower() not in OEB_STYLES or \
                    (elem.get('rel') or 'stylesheet').lower() != 'stylesheet' or \
                    not media_ok(elem.get('media')):
                continue
            href = elem.get('href')
            if not href:
                continue
            sheet_name = container.href_to_name(href, name)
            if not container.has_name(sheet_name):
                continue
            sheet = container.parsed(sheet_name)
            if not isinstance(sheet, CSSStyleSheet):
                continue
        process_sheet(sheet, sheet_name)

    # Inline style="" attributes: highest specificity category (1,0,0,0,0).
    for elem in root.xpath('//*[@style]'):
        text = elem.get('style')
        if text:
            style = container.parse_css(text, is_declaration=True)
            style_map[elem].append(
                StyleDeclaration(Specificity(1, 0, 0, 0, 0),
                                 normalize_style_declaration(style, name),
                                 None))

    # Sort each element's declarations by descending specificity so the
    # resolvers see the winning declaration first.
    for l in (style_map, pseudo_style_map):
        for x in l.itervalues():
            x.sort(key=itemgetter(0), reverse=True)

    style_map = {
        elem: resolve_declarations(x)
        for elem, x in style_map.iteritems()
    }
    pseudo_style_map = {
        elem: resolve_pseudo_declarations(x)
        for elem, x in pseudo_style_map.iteritems()
    }

    return partial(resolve_property,
                   style_map), partial(resolve_pseudo_property, style_map,
                                       pseudo_style_map), select
Ejemplo n.º 14
0
def remove_unused_css(container, report=None, remove_unused_classes=False):
    '''
    Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content.

    :param report: An optional callable that takes a single argument. It is called with information about the operations being performed.
    :param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed.
    :return: True if anything in the book was changed.
    '''
    # Default to a no-op so callers may omit the report callback.
    report = report or (lambda x: x)

    def safe_parse(name):
        # container.parsed() raises TypeError for items that cannot be
        # parsed as CSS; such sheets are treated as absent (None).
        try:
            return container.parsed(name)
        except TypeError:
            pass

    # Parsed stylesheet per CSS item in the book, dropping unparseable ones.
    sheets = {
        name: safe_parse(name)
        for name, mt in container.mime_map.iteritems() if mt in OEB_STYLES
    }
    sheets = {k: v for k, v in sheets.iteritems() if v is not None}
    # For every sheet, the set of other sheets it pulls in via @import.
    import_map = {
        name: get_imported_sheets(name, container, sheets)
        for name in sheets
    }
    if remove_unused_classes:
        # Lower-cased class names referenced anywhere in each sheet's rules.
        class_map = {
            name: {icu_lower(x)
                   for x in classes_in_rule_list(sheet.cssRules)}
            for name, sheet in sheets.iteritems()
        }
    # Candidate unused rules per sheet; filter_used_rules() below yields only
    # the rules that did NOT match anything, so these tuples shrink as each
    # document is scanned and whatever survives is removed at the end.
    style_rules = {
        name: tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
        for name, sheet in sheets.iteritems()
    }

    num_of_removed_rules = num_of_removed_classes = 0

    # Scan every HTML file, filtering out rules that match real content.
    for name, mt in container.mime_map.iteritems():
        if mt not in OEB_DOCS:
            continue
        root = container.parsed(name)
        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        used_classes = set()
        # Inline <style> blocks are processed (and rewritten) per document.
        for style in root.xpath('//*[local-name()="style"]'):
            if style.get('type', 'text/css') == 'text/css' and style.text:
                sheet = container.parse_css(style.text)
                if remove_unused_classes:
                    used_classes |= {
                        icu_lower(x)
                        for x in classes_in_rule_list(sheet.cssRules)
                    }
                imports = get_imported_sheets(name,
                                              container,
                                              sheets,
                                              sheet=sheet)
                for imported_sheet in imports:
                    style_rules[imported_sheet] = tuple(
                        filter_used_rules(style_rules[imported_sheet],
                                          container.log, select))
                    if remove_unused_classes:
                        used_classes |= class_map[imported_sheet]
                rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
                unused_rules = tuple(
                    filter_used_rules(rules, container.log, select))
                # Inline sheets are pruned immediately (unlike linked sheets,
                # which are only pruned after all documents are scanned).
                if unused_rules:
                    num_of_removed_rules += len(unused_rules)
                    for r in unused_rules:
                        sheet.cssRules.remove(r)
                    style.text = force_unicode(sheet.cssText, 'utf-8')
                    pretty_script_or_style(container, style)
                    container.dirty(name)

        # Linked stylesheets (and their imports) are only filtered here;
        # removal happens once, after every document has been seen.
        for link in root.xpath('//*[local-name()="link" and @href]'):
            sname = container.href_to_name(link.get('href'), name)
            if sname not in sheets:
                continue
            style_rules[sname] = tuple(
                filter_used_rules(style_rules[sname], container.log, select))
            if remove_unused_classes:
                used_classes |= class_map[sname]

            for iname in import_map[sname]:
                style_rules[iname] = tuple(
                    filter_used_rules(style_rules[iname], container.log,
                                      select))
                if remove_unused_classes:
                    used_classes |= class_map[iname]

        if remove_unused_classes:
            # Strip class tokens not referenced by any applicable sheet,
            # preserving the original case/order of the kept tokens.
            for elem in root.xpath('//*[@class]'):
                original_classes, classes = elem.get('class', '').split(), []
                for x in original_classes:
                    if icu_lower(x) in used_classes:
                        classes.append(x)
                if len(classes) != len(original_classes):
                    if classes:
                        elem.set('class', ' '.join(classes))
                    else:
                        del elem.attrib['class']
                    num_of_removed_classes += len(original_classes) - len(
                        classes)
                    container.dirty(name)

    # Final pass: whatever rules survived filtering across every document
    # were never used anywhere — remove them from their sheets.
    for name, sheet in sheets.iteritems():
        unused_rules = style_rules[name]
        if unused_rules:
            num_of_removed_rules += len(unused_rules)
            for r in unused_rules:
                sheet.cssRules.remove(r)
            container.dirty(name)

    if num_of_removed_rules > 0:
        report(
            ngettext('Removed %d unused CSS style rule',
                     'Removed %d unused CSS style rules', num_of_removed_rules)
            % num_of_removed_rules)
    else:
        report(_('No unused CSS style rules found'))
    if remove_unused_classes:
        if num_of_removed_classes > 0:
            report(
                ngettext('Removed %d unused class from the HTML',
                         'Removed %d unused classes from the HTML',
                         num_of_removed_classes) % num_of_removed_classes)
        else:
            report(_('No unused class attributes found'))
    return num_of_removed_rules + num_of_removed_classes > 0