Example #1
def extract_calibre_cover(raw, base, log):
    import os
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup(raw)
    matches = soup.find(name=["h1", "h2", "h3", "h4", "h5", "h6", "p", "span", "font", "br"])
    images = soup.findAll("img")
    if matches is None and len(images) == 1 and images[0].get("alt", "") == "cover":
        img = images[0]
        img = os.path.join(base, *img["src"].split("/"))
        if os.path.exists(img):
            return open(img, "rb").read()

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find("body")
        if body is not None:
            text = u"".join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll("img", src=True)
            if 0 < len(images) < 2:
                img = os.path.join(base, *images[0]["src"].split("/"))
                if os.path.exists(img):
                    return open(img, "rb").read()
Example #2
def extract_calibre_cover(raw, base, log):
    import os
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(raw)
    matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
        'font', 'br'])
    images = soup.findAll('img')
    if matches is None and len(images) == 1 and \
            images[0].get('alt', '')=='cover':
        img = images[0]
        img = os.path.join(base, *img['src'].split('/'))
        if os.path.exists(img):
            return open(img, 'rb').read()

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find('body')
        if body is not None:
            text = u''.join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll('img', src=True)
            if 0 < len(images) < 2:
                img = os.path.join(base, *images[0]['src'].split('/'))
                if os.path.exists(img):
                    return open(img, 'rb').read()
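A minimal driver sketch for the function above. The file names and directory are hypothetical; in calibre the raw HTML and its base directory come from the conversion pipeline:

import os

# Hypothetical inputs: an HTML file whose body holds only a cover <img>.
base = '/tmp/book'
with open(os.path.join(base, 'titlepage.html'), 'rb') as f:
    raw = f.read()

cover_data = extract_calibre_cover(raw, base, log=None)
if cover_data is not None:
    with open(os.path.join(base, 'cover.jpg'), 'wb') as out:
        out.write(cover_data)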
Example #3
    def get_soup(self, src, url=None):
        nmassage = []
        nmassage.extend(self.preprocess_regexps)
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        for pat, repl in nmassage:
            usrc = pat.sub(repl, usrc)
        soup = BeautifulSoup(usrc)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
            for pat, repl in nmassage:
                replace = pat.sub(repl, replace)
            soup = BeautifulSoup(replace)

        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
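The keep_only_tags / remove_tags* attributes consumed by get_soup() above are keyword-argument dicts (or lists of them) that are splatted into soup.find() / soup.findAll(). A hypothetical recipe might configure them like this; the tag names and classes are illustrative only:

keep_only_tags = [dict(name='div', attrs={'id': 'article-body'})]
remove_tags = [
    dict(name='div', attrs={'class': 'advert'}),
    dict(name='script'),
]
remove_tags_after = dict(name='div', attrs={'class': 'article-footer'})
remove_tags_before = dict(name='h1')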
Example #4
    def generate_html(comments):
        args = dict(xmlns=XHTML_NS,
                    title_str=title_str,
                    css=css,
                    title=title,
                    author=author,
                    publisher=publisher,
                    pubdate_label=_('Published'), pubdate=pubdate,
                    series_label=_('Series'), series=series,
                    rating_label=_('Rating'), rating=rating,
                    tags_label=_('Tags'), tags=tags,
                    comments=comments,
                    footer=''
                    )
        for key in mi.custom_field_keys():
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                key = key.replace('#', '_')
                args[key] = escape(val)
                args[key+'_label'] = escape(display_name)
            except:
                pass

        # Used in the comment describing use of custom columns in templates
        args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
        args['_genre'] = args.get('_genre', '{_genre}')

        generated_html = P('jacket/template.xhtml',
                data=True).decode('utf-8').format(**args)

        # Post-process the generated html to strip out empty header items

        soup = BeautifulSoup(generated_html)
        if not series:
            series_tag = soup.find(attrs={'class':'cbj_series'})
            if series_tag is not None:
                series_tag.extract()
        if not rating:
            rating_tag = soup.find(attrs={'class':'cbj_rating'})
            if rating_tag is not None:
                rating_tag.extract()
        if not tags:
            tags_tag = soup.find(attrs={'class':'cbj_tags'})
            if tags_tag is not None:
                tags_tag.extract()
        if not pubdate:
            pubdate_tag = soup.find(attrs={'class':'cbj_pubdata'})
            if pubdate_tag is not None:
                pubdate_tag.extract()
        if output_profile.short_name != 'kindle':
            hr_tag = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'})
            if hr_tag is not None:
                hr_tag.extract()

        return strip_encoding_declarations(
                soup.renderContents('utf-8').decode('utf-8'))
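generate_html() fills the jacket template via plain str.format(), so template.xhtml is expected to contain placeholders named after the args keys, wrapped in the cbj_* classes that the post-processing step strips when empty. An illustrative fragment only, not the real template:

<div class="cbj_series">{series_label}: {series}</div>
<div class="cbj_rating">{rating_label}: {rating}</div>
<div class="cbj_tags">{tags_label}: {tags}</div>
<div class="cbj_pubdata">{pubdate_label}: {pubdate}</div>
<hr class="cbj_kindle_banner_hr" />
<div>{comments}</div>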
Example #5
    def get_soup(self, src, url=None):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        # Some websites have buggy doctype declarations that mess up beautifulsoup
        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        soup = BeautifulSoup(usrc, markupMassage=nmassage)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Example #6
def existing_annotations(parent, field, return_all=False):
    '''
    Return the ids of books with existing annotations; if return_all is False,
    stop at the first annotated book found (existence check)
    '''
    import calibre_plugins.annotations.config as cfg
    annotation_map = []
    if field:
        db = parent.opts.gui.current_db
        id = db.FIELD_MAP['id']
        for i, record in enumerate(db.data.iterall()):
            mi = db.get_metadata(record[id], index_is_id=True)
            if field == 'Comments':
                if mi.comments:
                    soup = BeautifulSoup(mi.comments)
                else:
                    continue
            else:
                soup = BeautifulSoup(mi.get_user_metadata(field, False)['#value#'])
            if soup.find('div', 'user_annotations') is not None:
                annotation_map.append(mi.id)
                if not return_all:
                    break
        if return_all:
            _log_location("Identified %d annotated books of %d total books" %
                (len(annotation_map), len(db.data)))
        return annotation_map
Example #7
def save_soup(soup, target):
    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    nm = ns.find('meta')
    metas = soup.findAll('meta', content=True)
    added = False
    for meta in metas:
        if 'charset' in meta.get('content', '').lower():
            meta.replaceWith(nm)
            added = True
    if not added:
        head = soup.find('head')
        if head is not None:
            head.insert(0, nm)

    selfdir = os.path.dirname(target)

    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))
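A short usage sketch with hypothetical paths. save_soup() forces a UTF-8 charset <meta> onto the document and rewrites absolute local src/href attributes relative to the target before writing:

# Hypothetical call: soup parsed from a fetched page whose resources live
# in the same directory tree as the target file.
soup = BeautifulSoup(open('/tmp/fetch/index_raw.html', 'rb').read())
save_soup(soup, '/tmp/fetch/index.html')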
Example #8
def existing_annotations(parent, field, return_all=False):
    """
    Return the ids of books with existing annotations; if return_all is False,
    stop at the first annotated book found (existence check)
    """
    # import calibre_plugins.marvin_manager.config as cfg
    _log_location(field)
    annotation_map = []
    if field:
        db = parent.opts.gui.current_db
        id = db.FIELD_MAP["id"]
        for i, record in enumerate(db.data.iterall()):
            mi = db.get_metadata(record[id], index_is_id=True)
            if field == "Comments":
                if mi.comments:
                    soup = BeautifulSoup(mi.comments)
                else:
                    continue
            else:
                soup = BeautifulSoup(mi.get_user_metadata(field, False)["#value#"])
            if soup.find("div", "user_annotations") is not None:
                annotation_map.append(mi.id)
                if not return_all:
                    break
        if return_all:
            _log("Identified %d annotated books of %d total books" % (len(annotation_map), len(db.data)))

        _log("annotation_map: %s" % repr(annotation_map))
    else:
        _log("no active field")

    return annotation_map
Example #9
def get_series(title, authors, timeout=60):
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode):
        title = title.encode('utf-8')

    title = urllib.quote_plus(title)

    author = authors[0].strip()
    if not author:
        return mi
    if ',' in author:
        author = author.split(',')[0]
    else:
        author = author.split()[-1]

    url = URL.format(author, title)
    br = browser()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi
    raw = xml_to_unicode(raw)[0]
    soup = BeautifulSoup(raw)
    searcharea = soup.find('div', attrs={'class':'searcharea'})
    if searcharea is None:
        return mi
    ss = searcharea.find('div', attrs={'class':'seriessearch'})
    if ss is None:
        return mi
    a = ss.find('a', href=True)
    if a is None:
        return mi
    href = a['href'].partition('?')[-1]
    data = urlparse.parse_qs(href)
    series = data.get('SeriesName', [])
    if not series:
        return mi
    series = series[0]
    series = re.sub(r' series$', '', series).strip()
    if series:
        mi.series = series
    ns = ss.nextSibling
    if ns.contents:
        raw = unicode(ns.contents[0])
        raw = raw.partition('.')[0].strip()
        try:
            mi.series_index = int(raw)
        except:
            pass
    return mi
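A hedged usage sketch for get_series(); URL, browser() and the other module-level names come from the surrounding calibre source, and the title/author pair is made up:

# Hypothetical lookup; returns a Metadata object whose series/series_index
# are filled in only when the KDL server recognizes the book.
mi = get_series('The Eye of the World', ['Robert Jordan'])
if mi.series:
    print('%s #%s' % (mi.series, mi.series_index))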
Example #10
    def find_all_annotated_books(self):
        '''
        Find all annotated books in library
        '''
        self._log_location("field: {0}".format(self.field))
        cids = self.cdb.search_getting_ids('formats:EPUB', '')
        for cid in cids:
            mi = self.cdb.get_metadata(cid, index_is_id=True)
            raw = mi.get_user_metadata(self.field, False)
            if raw['#value#'] is not None:
                soup = BeautifulSoup(raw['#value#'])
                if soup.find('div', 'user_annotations') is not None:
                    self.annotation_map.append(mi.id)
Example #11
    def _remove_old_style(self, html):
        '''
        Remove the old style tag, finalize soup in preparation for styling
        '''
        unstyled_soup = BeautifulSoup(html)
        head = unstyled_soup.find("head")
        voc = unstyled_soup.body.find('div', {'class': 'vocabulary'})
        tds = voc.findAll(lambda tag: tag.name == 'td' and tag.a)
        dart = random.randrange(len(tds))
        self.td = tds[dart]
        self.oh = self.td.a['href']
        self.td.a['href'] = self._finalize()
        old_style = head.find('style')
        if old_style:
            old_style.extract()
        return unstyled_soup
Example #12
    def _inject_css(self, html):
        '''
        Stick a <style> element into html
        '''
        css = self.prefs.get('injected_css', None)
        if css:
            try:
                styled_soup = BeautifulSoup(html)
                head = styled_soup.find("head")
                style_tag = Tag(styled_soup, 'style')
                style_tag['type'] = "text/css"
                style_tag.insert(0, css)
                head.insert(0, style_tag)
                html = styled_soup.renderContents()
            except:
                return html
        return html
Example #13
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p>' returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode_type):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
                for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
        '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(lost_cr.group(),
                                    '%s%s\n\n%s' % (lost_cr.group(1),
                                                    lost_cr.group(2),
                                                    lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '&mdash;')

    soup = BeautifulSoup('<div>' + comments + '</div>').find('div')
    result = BeautifulSoup('<div>')
    container = result.find('div')
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if isinstance(token,  (CData, Comment, Declaration, ProcessingInstruction)):
            continue
        if isinstance(token, NavigableString):
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a',
                'hr']:
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                container.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            container.insert(rtc, token)
            rtc += 1

    if open_pTag:
        container.insert(rtc, pTag)

    for p in container.findAll('p'):
        p['class'] = 'description'

    return container.decode_contents().replace('<br></br>', '<br>')
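Restating the docstring's contract as a quick sanity check; the expected results (shown as comments) follow the code paths above:

comments_to_html('plain text')
#   -> '<p class="description">plain text</p>'
comments_to_html('One paragraph\n\nAnother paragraph')
#   -> two <p class="description"> blocks
comments_to_html('<p>pre-formatted text</p>')
#   -> returned untouched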
Example #14
def move_annotations(
    parent, annotation_map, old_destination_field, new_destination_field, window_title="Moving annotations"
):
    """
    Move annotations from old_destination_field to new_destination_field
    annotation_map precalculated in thread in config.py
    """
    import calibre_plugins.marvin_manager.config as cfg

    _log_location(annotation_map)
    _log(" %s -> %s" % (old_destination_field, new_destination_field))

    db = parent.opts.gui.current_db
    id = db.FIELD_MAP["id"]

    # Show progress
    pb = ProgressBar(parent=parent, window_title=window_title)
    total_books = len(annotation_map)
    pb.set_maximum(total_books)
    pb.set_value(1)
    pb.set_label("{:^100}".format("Moving annotations for %d books" % total_books))
    pb.show()

    transient_db = "transient"

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        cfg.plugin_prefs.get(
            "COMMENTS_DIVIDER", "&middot;  &middot;  &bull;  &middot;  &#x2726;  &middot;  &bull;  &middot; &middot;"
        )
    )

    for cid in annotation_map:
        mi = db.get_metadata(cid, index_is_id=True)

        # Comments -> custom
        if old_destination_field == "Comments" and new_destination_field.startswith("#"):
            if mi.comments:
                old_soup = BeautifulSoup(mi.comments)
                uas = old_soup.find("div", "user_annotations")
                if uas:
                    # Remove user_annotations from Comments
                    uas.extract()

                    # Remove comments_divider from Comments
                    cd = old_soup.find("div", "comments_divider")
                    if cd:
                        cd.extract()

                    # Save stripped Comments
                    mi.comments = unicode(old_soup)

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Add user_annotations to destination
                    um = mi.metadata_for_field(new_destination_field)
                    um["#value#"] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record with stripped Comments, populated custom field
                    db.set_metadata(
                        cid, mi, set_title=False, set_authors=False, commit=True, force_changes=True, notify=True
                    )
                    pb.increment()

        # custom -> Comments
        elif old_destination_field.startswith("#") and new_destination_field == "Comments":
            if mi.get_user_metadata(old_destination_field, False)["#value#"] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)["#value#"])
                uas = old_soup.find("div", "user_annotations")
                if uas:
                    # Remove user_annotations from custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um["#value#"] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add user_annotations to Comments
                    if mi.comments is None:
                        mi.comments = unicode(new_soup)
                    else:
                        mi.comments = mi.comments + unicode(comments_divider) + unicode(new_soup)

                    # Update the record with stripped custom field, updated Comments
                    db.set_metadata(
                        cid, mi, set_title=False, set_authors=False, commit=True, force_changes=True, notify=True
                    )
                    pb.increment()

        # custom -> custom
        elif old_destination_field.startswith("#") and new_destination_field.startswith("#"):

            if mi.get_user_metadata(old_destination_field, False)["#value#"] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)["#value#"])
                uas = old_soup.find("div", "user_annotations")
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um["#value#"] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um["#value#"] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(
                        cid, mi, set_title=False, set_authors=False, commit=True, force_changes=True, notify=True
                    )
                    pb.increment()

        # same field -> same field - called from config:configure_appearance()
        elif old_destination_field == new_destination_field:
            pb.set_label("{:^100}".format("Updating annotations for %d books" % total_books))

            if new_destination_field == "Comments":
                if mi.comments:
                    old_soup = BeautifulSoup(mi.comments)
                    uas = old_soup.find("div", "user_annotations")
                    if uas:
                        # Remove user_annotations from Comments
                        uas.extract()

                        # Remove comments_divider from Comments
                        cd = old_soup.find("div", "comments_divider")
                        if cd:
                            cd.extract()

                        # Save stripped Comments
                        mi.comments = unicode(old_soup)

                        # Capture content
                        parent.opts.db.capture_content(uas, cid, transient_db)

                        # Regurgitate content with current CSS style
                        new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                        # Add user_annotations to Comments
                        if mi.comments is None:
                            mi.comments = unicode(new_soup)
                        else:
                            mi.comments = mi.comments + unicode(comments_divider) + unicode(new_soup)

                        # Update the record with stripped custom field, updated Comments
                        db.set_metadata(
                            cid, mi, set_title=False, set_authors=False, commit=True, force_changes=True, notify=True
                        )
                        pb.increment()

            else:
                # Update custom field
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)["#value#"])
                uas = old_soup.find("div", "user_annotations")
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Add stripped old_soup plus new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um["#value#"] = unicode(old_soup) + unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(
                        cid, mi, set_title=False, set_authors=False, commit=True, force_changes=True, notify=True
                    )
                    pb.increment()

    # Hide the progress bar
    pb.hide()

    # Get the eligible custom fields
    all_custom_fields = db.custom_field_keys()
    custom_fields = {}
    for cf in all_custom_fields:
        field_md = db.metadata_for_field(cf)
        if field_md["datatype"] in ["comments"]:
            custom_fields[field_md["name"]] = {"field": cf, "datatype": field_md["datatype"]}

    # Change field value to friendly name
    if old_destination_field.startswith("#"):
        for cf in custom_fields:
            if custom_fields[cf]["field"] == old_destination_field:
                old_destination_field = cf
                break
    if new_destination_field.startswith("#"):
        for cf in custom_fields:
            if custom_fields[cf]["field"] == new_destination_field:
                new_destination_field = cf
                break

    # Report what happened
    if old_destination_field == new_destination_field:
        msg = "<p>Annotations updated to new appearance settings for %d {0}.</p>" % len(annotation_map)
    else:
        msg = "<p>Annotations for %d {0} moved from <b>%s</b> to <b>%s</b>.</p>" % (
            len(annotation_map),
            old_destination_field,
            new_destination_field,
        )
    if len(annotation_map) == 1:
        msg = msg.format("book")
    else:
        msg = msg.format("books")
    MessageBox(MessageBox.INFO, "", msg=msg, show_copy_button=False, parent=parent.gui).exec_()
    _log("INFO: %s" % msg)

    # Update the UI
    updateCalibreGUIView()
Example #15
    def _reformat(self, data, htmlpath):
        if self.input_encoding:
            data = data.decode(self.input_encoding)
        try:
            data = xml_to_unicode(data, strip_encoding_pats=True)[0]
            soup = BeautifulSoup(data)
        except ValueError:
            # hit some strange encoding problems...
            self.log.exception("Unable to parse html for cleaning, leaving it")
            return data
        # nuke javascript...
        [s.extract() for s in soup('script')]
        # See if everything is inside a <head> tag
        # https://bugs.launchpad.net/bugs/1273512
        body = soup.find('body')
        if body is not None and body.parent.name == 'head':
            html = soup.find('html')
            html.insert(len(html), body)

        # remove forward and back nav bars from the top/bottom of each page
        # cos they really f**k with the flow of things and generally waste space
        # since we can't use [a,b] syntax to select arbitrary items from a list
        # we'll have to do this manually...
        # only remove the tables, if they have an image with an alt attribute
        # containing prev, next or team
        t = soup('table')
        if t:
            if (t[0].previousSibling is None
                    or t[0].previousSibling.previousSibling is None):
                try:
                    alt = t[0].img['alt'].lower()
                    if alt.find('prev') != -1 or alt.find(
                            'next') != -1 or alt.find('team') != -1:
                        t[0].extract()
                except:
                    pass
            if (t[-1].nextSibling is None
                    or t[-1].nextSibling.nextSibling is None):
                try:
                    alt = t[-1].img['alt'].lower()
                    if alt.find('prev') != -1 or alt.find(
                            'next') != -1 or alt.find('team') != -1:
                        t[-1].extract()
                except:
                    pass
        # for some very odd reason each page's content appears to be in a table
        # too. and this table has sub-tables for random asides... grr.

        # remove br at top of page if present after nav bars removed
        br = soup('br')
        if br:
            if check_all_prev_empty(br[0].previousSibling):
                br[0].extract()

        # some images seem to be broken in some chm's :/
        base = os.path.dirname(htmlpath)
        for img in soup('img', src=True):
            src = img['src']
            ipath = os.path.join(base, *src.split('/'))
            if os.path.exists(ipath):
                continue
            src = src.split(';')[0]
            if not src:
                continue
            ipath = os.path.join(base, *src.split('/'))
            if not os.path.exists(ipath):
                while src.startswith('../'):
                    src = src[3:]
            img['src'] = src
        try:
            # if there is only a single table with a single element
            # in the body, replace it by the contents of this single element
            tables = soup.body.findAll('table', recursive=False)
            if tables and len(tables) == 1:
                trs = tables[0].findAll('tr', recursive=False)
                if trs and len(trs) == 1:
                    tds = trs[0].findAll('td', recursive=False)
                    if tds and len(tds) == 1:
                        tdContents = tds[0].contents
                        tableIdx = soup.body.contents.index(tables[0])
                        tables[0].extract()
                        while tdContents:
                            soup.body.insert(tableIdx, tdContents.pop())
        except:
            pass
        # do not prettify, it would reformat the <pre> tags!
        try:
            ans = str(soup)
            self.re_encoded_files.add(os.path.abspath(htmlpath))
            return ans
        except RuntimeError:
            return data
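The single-table unwrapping near the end of _reformat() is easiest to see on a toy document (hypothetical before/after):

# Hypothetical effect of the single-table unwrapping:
#   <body><table><tr><td><h1>Title</h1><p>Text</p></td></tr></table></body>
# becomes
#   <body><h1>Title</h1><p>Text</p></body>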
Example #16
    def generate_html(comments):
        args = dict(xmlns=XHTML_NS,
                    title_str=title_str,
                    css=css,
                    title=title,
                    author=author,
                    publisher=publisher,
                    pubdate_label=_('Published'),
                    pubdate=pubdate,
                    series_label=_('Series'),
                    series=series,
                    rating_label=_('Rating'),
                    rating=rating,
                    tags_label=_('Tags'),
                    tags=tags,
                    comments=comments,
                    footer='')
        for key in mi.custom_field_keys():
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                key = key.replace('#', '_')
                args[key] = escape(val)
                args[key + '_label'] = escape(display_name)
            except:
                # if the val (custom column contents) is None, don't add to args
                pass

        if False:
            print("Custom column values available in jacket template:")
            for key in args.keys():
                if key.startswith('_') and not key.endswith('_label'):
                    print(" %s: %s" % ('#' + key[1:], args[key]))

        # Used in the comment describing use of custom columns in templates
        # Don't change this unless you also change it in template.xhtml
        args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
        args['_genre'] = args.get('_genre', '{_genre}')

        generated_html = P('jacket/template.xhtml',
                           data=True).decode('utf-8').format(**args)

        # Post-process the generated html to strip out empty header items

        soup = BeautifulSoup(generated_html)
        if not series:
            series_tag = soup.find(attrs={'class': 'cbj_series'})
            if series_tag is not None:
                series_tag.extract()
        if not rating:
            rating_tag = soup.find(attrs={'class': 'cbj_rating'})
            if rating_tag is not None:
                rating_tag.extract()
        if not tags:
            tags_tag = soup.find(attrs={'class': 'cbj_tags'})
            if tags_tag is not None:
                tags_tag.extract()
        if not pubdate:
            pubdate_tag = soup.find(attrs={'class': 'cbj_pubdata'})
            if pubdate_tag is not None:
                pubdate_tag.extract()
        if output_profile.short_name != 'kindle':
            hr_tag = soup.find('hr', attrs={'class': 'cbj_kindle_banner_hr'})
            if hr_tag is not None:
                hr_tag.extract()

        return strip_encoding_declarations(
            soup.renderContents('utf-8').decode('utf-8'))
Example #17
    def generate_html(comments):
        args = dict(
            xmlns=XHTML_NS,
            title_str=title_str,
            css=css,
            title=title,
            author=author,
            publisher=publisher,
            pubdate_label=_('Published'),
            pubdate=pubdate,
            series_label=_('Series'),
            series=series,
            rating_label=_('Rating'),
            rating=rating,
            tags_label=_('Tags'),
            tags=tags,
            comments=comments,
            footer='',
            searchable_tags=' '.join(
                escape(t) + 'ttt' for t in tags.tags_list),
        )
        for key in mi.custom_field_keys():
            m = mi.get_user_metadata(key, False) or {}
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                dkey = key.replace('#', '_')
                dt = m.get('datatype')
                if dt == 'series':
                    args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
                elif dt == 'rating':
                    args[dkey] = rating_to_stars(
                        mi.get(key),
                        m.get('display', {}).get('allow_half_stars', False))
                else:
                    args[dkey] = escape(val)
                args[dkey + '_label'] = escape(display_name)
            except Exception:
                # if the val (custom column contents) is None, don't add to args
                pass

        if False:
            print("Custom column values available in jacket template:")
            for key in list(args.keys()):
                if key.startswith('_') and not key.endswith('_label'):
                    print((" %s: %s" % ('#' + key[1:], args[key])))

        # Used in the comment describing use of custom columns in templates
        # Don't change this unless you also change it in template.xhtml
        args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
        args['_genre'] = args.get('_genre', '{_genre}')

        formatter = SafeFormatter()
        generated_html = formatter.format(template, **args)

        # Post-process the generated html to strip out empty header items

        soup = BeautifulSoup(generated_html)
        if not series:
            series_tag = soup.find(attrs={'class': 'cbj_series'})
            if series_tag is not None:
                series_tag.extract()
        if not rating:
            rating_tag = soup.find(attrs={'class': 'cbj_rating'})
            if rating_tag is not None:
                rating_tag.extract()
        if not tags:
            tags_tag = soup.find(attrs={'class': 'cbj_tags'})
            if tags_tag is not None:
                tags_tag.extract()
        if not pubdate:
            pubdate_tag = soup.find(attrs={'class': 'cbj_pubdata'})
            if pubdate_tag is not None:
                pubdate_tag.extract()
        if output_profile.short_name != 'kindle':
            hr_tag = soup.find('hr', attrs={'class': 'cbj_kindle_banner_hr'})
            if hr_tag is not None:
                hr_tag.extract()

        return strip_encoding_declarations(
            soup.renderContents('utf-8').decode('utf-8'))
Example #18
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    old_soup, new_soup: BeautifulSoup()
    Need to strip <hr>, re-sort based on location, build new merged_soup
    with optional interleaved <hr> elements.
    '''
    TRANSIENT_DB = 'transient'

    if False:
        '''
        Older technique: Use hashes to merge annotations
        '''
        #Get the hashes of any existing annotations
        oiuas = old_soup.findAll('div', 'annotation')
        old_hashes = set([ua['hash'] for ua in oiuas])

        # Extract old user_annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))

        # Find new annotations
        uas = new_soup.findAll('div', 'annotation')
        new_hashes = set([ua['hash'] for ua in uas])

        updates = list(new_hashes.difference(old_hashes))
        if len(updates) and ouas is not None:
            # Append new to regurgitated
            dtc = len(regurgitated_soup.div)
            for new_annotation_id in updates:
                new_annotation = new_soup.find('div', {'hash': new_annotation_id})
                regurgitated_soup.div.insert(dtc, new_annotation)
                dtc += 1
            if old_soup:
                merged_soup = unicode(old_soup) + unicode(sort_merged_annotations(regurgitated_soup))
            else:
                merged_soup = unicode(sort_merged_annotations(regurgitated_soup))
        else:
            if old_soup:
                merged_soup = unicode(old_soup) + unicode(new_soup)
            else:
                merged_soup = unicode(new_soup)
        return merged_soup

    else:
        '''
        Newer technique: Use timestamps to merge annotations
        '''
        timestamps = {}
        # Get the timestamps and hashes of the stored annotations
        suas = old_soup.findAll('div', 'annotation')
        for sua in suas:
            #print("sua: %s" % sua.prettify())
            timestamp = sua.find('td', 'timestamp')['uts']
            timestamps[timestamp] = {'stored_hash': sua['hash']}

        # Rerender stored annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate annotations with current CSS
            rerendered_annotations = parent.opts.db.rerender_to_html(TRANSIENT_DB, cid)
            regurgitated_soup = BeautifulSoup(rerendered_annotations)

        # Add device annotation timestamps and hashes
        duas = new_soup.findAll('div', 'annotation')
        for dua in duas:
            timestamp = dua.find('td', 'timestamp')['uts']
            if timestamp in timestamps:
                timestamps[timestamp]['device_hash'] = dua['hash']
            else:
                timestamps[timestamp] = {'device_hash': dua['hash']}

        merged_annotations = Tag(BeautifulSoup(), 'div',
            [('class', "user_annotations"), ('style','margin:0')])

        for ts in sorted(timestamps):
            if 'stored_hash' in timestamps[ts] and not 'device_hash' in timestamps[ts]:
                # Stored only - add from regurgitated_soup
                annotation = regurgitated_soup.find('div', {'hash': timestamps[ts]['stored_hash']})

            elif not 'stored_hash' in timestamps[ts] and 'device_hash' in timestamps[ts]:
                # Device only - add from new_soup
                annotation = new_soup.find('div', {'hash': timestamps[ts]['device_hash']})

            elif timestamps[ts]['stored_hash'] == timestamps[ts]['device_hash']:
                # Stored matches device - add from regurgitated_soup, as user may have modified
                annotation = regurgitated_soup.find('div', {'hash': timestamps[ts]['stored_hash']})

            elif timestamps[ts]['stored_hash'] != timestamps[ts]['device_hash']:
                # Device has been updated since initial capture - add from new_soup
                annotation = new_soup.find('div', {'hash': timestamps[ts]['device_hash']})

            else:
                continue

            merged_annotations.append(annotation)

        merged_annotations = sort_merged_annotations(merged_annotations)

        # Update new_soup with merged_annotations
        new_soup_uas = new_soup.find('div', 'user_annotations')
        new_soup_uas.replaceWith(merged_annotations)

        return unicode(new_soup)
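For reference, the timestamps dict that drives the merge above is keyed by each annotation's uts attribute and holds one or both hashes per entry; a hypothetical snapshot:

# Hypothetical contents of timestamps after both loops have run:
# {
#  '1361825700': {'stored_hash': 'abc123'},                           # stored only
#  '1361825811': {'device_hash': 'def456'},                           # device only
#  '1361825922': {'stored_hash': 'aaa111', 'device_hash': 'aaa111'},  # unchanged
#  '1361826033': {'stored_hash': 'bbb222', 'device_hash': 'ccc333'},  # edited on device
# }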
Example #19
    def _reformat(self, data, htmlpath):
        if self.input_encoding:
            data = data.decode(self.input_encoding)
        try:
            data = xml_to_unicode(data, strip_encoding_pats=True)[0]
            soup = BeautifulSoup(data)
        except ValueError:
            # hit some strange encoding problems...
            self.log.exception("Unable to parse html for cleaning, leaving it")
            return data
        # nuke javascript...
        [s.extract() for s in soup('script')]
        # See if everything is inside a <head> tag
        # https://bugs.launchpad.net/bugs/1273512
        body = soup.find('body')
        if body is not None and body.parent.name == 'head':
            html = soup.find('html')
            html.insert(len(html), body)

        # remove forward and back nav bars from the top/bottom of each page
        # cos they really f**k with the flow of things and generally waste space
        # since we can't use [a,b] syntax to select arbitrary items from a list
        # we'll have to do this manually...
        # only remove the tables, if they have an image with an alt attribute
        # containing prev, next or team
        t = soup('table')
        if t:
            if (t[0].previousSibling is None or t[0].previousSibling.previousSibling is None):
                try:
                    alt = t[0].img['alt'].lower()
                    if alt.find('prev') != -1 or alt.find('next') != -1 or alt.find('team') != -1:
                        t[0].extract()
                except:
                    pass
            if (t[-1].nextSibling is None or t[-1].nextSibling.nextSibling is None):
                try:
                    alt = t[-1].img['alt'].lower()
                    if alt.find('prev') != -1 or alt.find('next') != -1 or alt.find('team') != -1:
                        t[-1].extract()
                except:
                    pass
        # for some very odd reason each page's content appears to be in a table
        # too. and this table has sub-tables for random asides... grr.

        # remove br at top of page if present after nav bars removed
        br = soup('br')
        if br:
            if check_all_prev_empty(br[0].previousSibling):
                br[0].extract()

        # some images seem to be broken in some chm's :/
        base = os.path.dirname(htmlpath)
        for img in soup('img', src=True):
            src = img['src']
            ipath = os.path.join(base, *src.split('/'))
            if os.path.exists(ipath):
                continue
            src = src.split(';')[0]
            if not src:
                continue
            ipath = os.path.join(base, *src.split('/'))
            if not os.path.exists(ipath):
                while src.startswith('../'):
                    src = src[3:]
            img['src'] = src
        try:
            # if there is only a single table with a single element
            # in the body, replace it by the contents of this single element
            tables = soup.body.findAll('table', recursive=False)
            if tables and len(tables) == 1:
                trs = tables[0].findAll('tr', recursive=False)
                if trs and len(trs) == 1:
                    tds = trs[0].findAll('td', recursive=False)
                    if tds and len(tds) == 1:
                        tdContents = tds[0].contents
                        tableIdx = soup.body.contents.index(tables[0])
                        tables[0].extract()
                        while tdContents:
                            soup.body.insert(tableIdx, tdContents.pop())
        except:
            pass
        # do not prettify, it would reformat the <pre> tags!
        try:
            ans = str(soup)
            self.re_encoded_files.add(os.path.abspath(htmlpath))
            return ans
        except RuntimeError:
            return data
Example #20
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p>' returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    '''
    if not comments:
        return '<p></p>'
    if not isinstance(comments, str):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [
            '<p class="description">%s</p>' % x.replace('\n', '<br />')
            for x in comments.split('\n\n')
        ]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return '<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(
        lambda m: m.group().replace('.', '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(
            lost_cr.group(), '%s%s\n\n%s' %
            (lost_cr.group(1), lost_cr.group(2), lost_cr.group(3)))

    comments = comments.replace('\r', '')
    # Convert \n\n to <p>s
    comments = comments.replace('\n\n', '<p>')
    # Convert solo returns to <br />
    comments = comments.replace('\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '&mdash;')

    soup = BeautifulSoup('<div>' + comments + '</div>').find('div')
    result = BeautifulSoup('<div>')
    container = result.find('div')
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    inline_tags = ('br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a', 'hr')
    for token in all_tokens:
        if isinstance(token,
                      (CData, Comment, Declaration, ProcessingInstruction)):
            continue
        if isinstance(token, NavigableString):
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        elif token.name in inline_tags:
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                container.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            container.insert(rtc, token)
            rtc += 1

    if open_pTag:
        container.insert(rtc, pTag)

    for p in container.findAll('p'):
        p['class'] = 'description'

    return container.decode_contents()
Example #21
    def generate_html(comments):
        args = dict(xmlns=XHTML_NS,
                    title_str=title_str,
                    css=css,
                    title=title,
                    author=author,
                    publisher=publisher,
                    pubdate_label=_('Published'), pubdate=pubdate,
                    series_label=_('Series'), series=series,
                    rating_label=_('Rating'), rating=rating,
                    tags_label=_('Tags'), tags=tags,
                    comments=comments,
                    footer='',
                    searchable_tags=' '.join(escape(t)+'ttt' for t in tags.tags_list),
                    )
        for key in mi.custom_field_keys():
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                key = key.replace('#', '_')
                args[key] = escape(val)
                args[key+'_label'] = escape(display_name)
            except:
                # if the val (custom column contents) is None, don't add to args
                pass

        if False:
            print("Custom column values available in jacket template:")
            for key in args.keys():
                if key.startswith('_') and not key.endswith('_label'):
                    print(" %s: %s" % ('#' + key[1:], args[key]))

        # Used in the comment describing use of custom columns in templates
        # Don't change this unless you also change it in template.xhtml
        args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
        args['_genre'] = args.get('_genre', '{_genre}')

        formatter = SafeFormatter()
        generated_html = formatter.format(template, **args)

        # Post-process the generated html to strip out empty header items

        soup = BeautifulSoup(generated_html)
        if not series:
            series_tag = soup.find(attrs={'class':'cbj_series'})
            if series_tag is not None:
                series_tag.extract()
        if not rating:
            rating_tag = soup.find(attrs={'class':'cbj_rating'})
            if rating_tag is not None:
                rating_tag.extract()
        if not tags:
            tags_tag = soup.find(attrs={'class':'cbj_tags'})
            if tags_tag is not None:
                tags_tag.extract()
        if not pubdate:
            pubdate_tag = soup.find(attrs={'class':'cbj_pubdata'})
            if pubdate_tag is not None:
                pubdate_tag.extract()
        if output_profile.short_name != 'kindle':
            hr_tag = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'})
            if hr_tag is not None:
                hr_tag.extract()

        return strip_encoding_declarations(
                soup.renderContents('utf-8').decode('utf-8'))
Example #22
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    old_soup, new_soup: BeautifulSoup()
    Need to strip <hr>, re-sort based on location, build new merged_soup
    with optional interleaved <hr> elements.
    '''
    TRANSIENT_DB = 'transient'

    # Fetch preferred merge index technique
    merge_index = getattr(parent.reader_app_class, 'MERGE_INDEX', 'hash')

    if merge_index == 'hash':
        # Get the hashes of any existing annotations
        oiuas = old_soup.findAll('div', 'annotation')
        old_hashes = set([ua['hash'] for ua in oiuas])

        # Extract old user_annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))

        # Find new annotations
        uas = new_soup.findAll('div', 'annotation')
        new_hashes = set([ua['hash'] for ua in uas])

        updates = list(new_hashes.difference(old_hashes))
        if len(updates) and ouas is not None:
            # Append new to regurgitated
            dtc = len(regurgitated_soup.div)
            for new_annotation_id in updates:
                new_annotation = new_soup.find('div', {'hash': new_annotation_id})
                regurgitated_soup.div.insert(dtc, new_annotation)
                dtc += 1
            if old_soup:
                merged_soup = unicode(old_soup) + unicode(sort_merged_annotations(regurgitated_soup))
            else:
                merged_soup = unicode(sort_merged_annotations(regurgitated_soup))
        else:
            if old_soup:
                merged_soup = unicode(old_soup) + unicode(new_soup)
            else:
                merged_soup = unicode(new_soup)
        return merged_soup

    elif merge_index == 'timestamp':
        timestamps = {}
        # Get the timestamps and hashes of the stored annotations
        suas = old_soup.findAll('div', 'annotation')
        for sua in suas:
            try:
                timestamp = sua.find('td', 'timestamp')['uts']
                timestamps[timestamp] = {'stored_hash': sua['hash']}
            except:
                continue

        # Rerender stored annotations (the stored-only entries below assume
        # this user_annotations div exists and was re-rendered successfully)
        regurgitated_soup = None
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))

        # Add device annotation timestamps and hashes
        duas = new_soup.findAll('div', 'annotation')
        for dua in duas:
            try:
                timestamp = dua.find('td', 'timestamp')['uts']
                if timestamp in timestamps:
                    timestamps[timestamp]['device_hash'] = dua['hash']
                else:
                    timestamps[timestamp] = {'device_hash': dua['hash']}
            except:
                print("ERROR: malformed timestamp in device annotation")
                print(dua.prettify())

        merged_soup = BeautifulSoup(ANNOTATIONS_HEADER)

        for ts in sorted(timestamps):
            if 'stored_hash' in timestamps[ts] and not 'device_hash' in timestamps[ts]:
                # Stored only - add from regurgitated_soup
                annotation = regurgitated_soup.find('div', {'hash': timestamps[ts]['stored_hash']})

            elif not 'stored_hash' in timestamps[ts] and 'device_hash' in timestamps[ts]:
                # Device only - add from new_soup
                annotation = new_soup.find('div', {'hash': timestamps[ts]['device_hash']})

            elif timestamps[ts]['stored_hash'] == timestamps[ts]['device_hash']:
                # Stored matches device - add from regurgitated_soup, as user may have modified
                annotation = regurgitated_soup.find('div', {'hash': timestamps[ts]['stored_hash']})

            elif timestamps[ts]['stored_hash'] != timestamps[ts]['device_hash']:
                # Device has been updated since initial capture - add from new_soup
                annotation = new_soup.find('div', {'hash': timestamps[ts]['device_hash']})

            else:
                continue

            merged_soup.div.append(annotation)

        return unicode(sort_merged_annotations(merged_soup))
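
The hash branch above reduces to a set difference: annotations are keyed by a content hash, and only hashes present on the device but missing from the library get appended. A small illustration (the names are made up, not the plugin's API):

old_hashes = {'a1', 'b2', 'c3'}        # already stored in the library
new_hashes = {'b2', 'c3', 'd4', 'e5'}  # found on the device

updates = new_hashes - old_hashes      # annotations that need appending
print(sorted(updates))                 # -> ['d4', 'e5']
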
Example #23
0
def move_annotations(parent,
                     annotation_map,
                     old_destination_field,
                     new_destination_field,
                     window_title=_("Moving annotations")):
    '''
    Move annotations from old_destination_field to new_destination_field
    annotation_map precalculated in thread in config.py
    '''
    import calibre_plugins.annotations.config as cfg

    _log_location("%s -> %s" % (old_destination_field, new_destination_field))

    library_db = parent.opts.gui.current_db
    id = library_db.FIELD_MAP['id']

    # Show progress
    pb = ProgressBar(parent=parent, window_title=window_title, on_top=True)
    total_books = len(annotation_map)
    pb.set_maximum(total_books)
    pb.set_value(1)
    pb.set_label('{:^100}'.format('%s for %d books' %
                                  (window_title, total_books)))
    pb.show()

    id_map_old_destination_field = {}
    id_map_new_destination_field = {}
    transient_db = 'transient'

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        cfg.plugin_prefs.get(
            'COMMENTS_DIVIDER',
            '&middot;  &middot;  &bull;  &middot;  &#x2726;  &middot;  &bull;  &middot; &middot;'
        ))

    for cid in annotation_map:
        mi = library_db.get_metadata(cid, index_is_id=True)

        # Comments -> custom
        if old_destination_field == 'Comments' and new_destination_field.startswith(
                '#'):
            if mi.comments:
                old_soup = BeautifulSoup(mi.comments)
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from Comments
                    uas.extract()

                    # Remove comments_divider from Comments
                    cd = old_soup.find('div', 'comments_divider')
                    if cd:
                        cd.extract()

                    # Capture content
                    annotation_list = parent.opts.db.capture_content(
                        uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html_from_list(
                        annotation_list)

                    id_map_old_destination_field[cid] = unicode(old_soup)
                    id_map_new_destination_field[cid] = unicode(new_soup)

                    pb.increment()

        # custom -> Comments
        elif old_destination_field.startswith(
                '#') and new_destination_field == 'Comments':
            if mi.get_user_metadata(old_destination_field,
                                    False)['#value#'] is not None:
                old_soup = BeautifulSoup(
                    mi.get_user_metadata(old_destination_field,
                                         False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from custom field
                    uas.extract()

                    # Capture content
                    annotation_list = parent.opts.db.capture_content(
                        uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html_from_list(
                        annotation_list)

                    # Add user_annotations to Comments
                    new_comments = ''
                    if mi.comments is None:
                        new_comments = unicode(new_soup)
                    else:
                        new_comments = mi.comments + \
                                      unicode(comments_divider) + \
                                      unicode(new_soup)

#                     # Update the record with stripped custom field, updated Comments
#                     library_db.set_metadata(cid, mi, set_title=False, set_authors=False,
#                                     commit=True, force_changes=True, notify=True)
                    id_map_old_destination_field[cid] = unicode(old_soup)
                    id_map_new_destination_field[cid] = new_comments
                    pb.increment()

        # custom -> custom
        elif old_destination_field.startswith(
                '#') and new_destination_field.startswith('#'):

            if mi.get_user_metadata(old_destination_field,
                                    False)['#value#'] is not None:
                old_soup = BeautifulSoup(
                    mi.get_user_metadata(old_destination_field,
                                         False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    annotation_list = parent.opts.db.capture_content(
                        uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html_from_list(
                        annotation_list)

                    id_map_old_destination_field[cid] = unicode(old_soup)
                    id_map_new_destination_field[cid] = unicode(new_soup)
                    pb.increment()

        # same field -> same field - called from config:configure_appearance()
        elif (old_destination_field == new_destination_field):
            pb.set_label('{:^100}'.format(
                _('Updating annotations for {0} books').format(total_books)))

            if new_destination_field == 'Comments':
                if mi.comments:
                    old_soup = BeautifulSoup(mi.comments)
                    uas = old_soup.find('div', 'user_annotations')
                    if uas:
                        # Remove user_annotations from Comments
                        uas.extract()

                        # Remove comments_divider from Comments
                        cd = old_soup.find('div', 'comments_divider')
                        if cd:
                            cd.extract()

                        # Save stripped Comments
                        mi.comments = unicode(old_soup)

                        # Capture content
                        annotation_list = parent.opts.db.capture_content(
                            uas, cid, transient_db)

                        # Regurgitate content with current CSS style
                        new_soup = parent.opts.db.rerender_to_html_from_list(
                            annotation_list)

                        # Add user_annotations to Comments
                        new_comments = ''
                        if mi.comments is None:
                            new_comments = unicode(new_soup)
                        else:
                            new_comments = mi.comments + \
                                          unicode(comments_divider) + \
                                          unicode(new_soup)

                        # Update the record with stripped custom field, updated Comments


#                         library_db.set_metadata(cid, mi, set_title=False, set_authors=False,
#                                         commit=True, force_changes=True, notify=True)
                        id_map_old_destination_field[cid] = unicode(old_soup)
                        id_map_new_destination_field[cid] = unicode(new_soup)
                        pb.increment()

            else:
                # Update custom field
                old_soup = BeautifulSoup(
                    mi.get_user_metadata(old_destination_field,
                                         False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    annotation_list = parent.opts.db.capture_content(
                        uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html_from_list(
                        annotation_list)

                    #                     # Add stripped old_soup plus new_soup to destination field
                    #                     um = mi.metadata_for_field(new_destination_field)
                    #                     um['#value#'] = unicode(old_soup) + unicode(new_soup)
                    #                     mi.set_user_metadata(new_destination_field, um)
                    #
                    #                     # Update the record
                    #                     library_db.set_metadata(cid, mi, set_title=False, set_authors=False,
                    #                                     commit=True, force_changes=True, notify=True)
                    id_map_old_destination_field[cid] = unicode(old_soup)
                    id_map_new_destination_field[cid] = unicode(new_soup)
                    pb.increment()

    if len(id_map_old_destination_field) > 0:
        debug_print(
            "move_annotations - Updating metadata - for column: %s number of changes=%d"
            % (old_destination_field, len(id_map_old_destination_field)))
        library_db.new_api.set_field(old_destination_field.lower(),
                                     id_map_old_destination_field)
    if len(id_map_new_destination_field) > 0:
        debug_print(
            "move_annotations - Updating metadata - for column: %s number of changes=%d"
            % (new_destination_field, len(id_map_new_destination_field)))
        library_db.new_api.set_field(new_destination_field.lower(),
                                     id_map_new_destination_field)

    # Hide the progress bar
    pb.hide()

    # Change field value to friendly name
    if old_destination_field.startswith('#'):
        for cf in parent.custom_fields:
            if parent.custom_fields[cf]['field'] == old_destination_field:
                old_destination_field = cf
                break
    if new_destination_field.startswith('#'):
        for cf in parent.custom_fields:
            if parent.custom_fields[cf]['field'] == new_destination_field:
                new_destination_field = cf
                break

    # Report what happened
    if len(annotation_map) == 1:
        book_word = _('book')
    else:
        book_word = _('books')
    if old_destination_field == new_destination_field:
        msg = _(
            "Annotations updated to new appearance settings for {0} {1}."
        ).format(len(annotation_map), book_word)
    else:
        msg = _("Annotations for {0} {1} moved from <b>{2}</b> to <b>{3}</b>."
                ).format(len(annotation_map), book_word, old_destination_field,
                         new_destination_field)
    msg = "<p>{0}</p>".format(msg)
    MessageBox(MessageBox.INFO,
               '',
               msg=msg,
               show_copy_button=False,
               parent=parent.gui).exec_()
    _log_location()
    _log("INFO: %s" % msg)
Example #24
0
    def generate_html(comments):
        args = dict(
            xmlns=XHTML_NS,
            title_str=title_str,
            css=css,
            title=title,
            author=author,
            publisher=publisher,
            pubdate_label=_("Published"),
            pubdate=pubdate,
            series_label=_("Series"),
            series=series,
            rating_label=_("Rating"),
            rating=rating,
            tags_label=_("Tags"),
            tags=tags,
            comments=comments,
            footer="",
        )
        for key in mi.custom_field_keys():
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                key = key.replace("#", "_")
                args[key] = escape(val)
                args[key + "_label"] = escape(display_name)
            except:
                # if the val (custom column contents) is None, don't add to args
                pass

        if False:
            print("Custom column values available in jacket template:")
            for key in args.keys():
                if key.startswith("_") and not key.endswith("_label"):
                    print(" %s: %s" % ("#" + key[1:], args[key]))

        # Used in the comment describing use of custom columns in templates
        # Don't change this unless you also change it in template.xhtml
        args["_genre_label"] = args.get("_genre_label", "{_genre_label}")
        args["_genre"] = args.get("_genre", "{_genre}")

        formatter = SafeFormatter()
        generated_html = formatter.format(template, **args)

        # Post-process the generated html to strip out empty header items

        soup = BeautifulSoup(generated_html)
        if not series:
            series_tag = soup.find(attrs={"class": "cbj_series"})
            if series_tag is not None:
                series_tag.extract()
        if not rating:
            rating_tag = soup.find(attrs={"class": "cbj_rating"})
            if rating_tag is not None:
                rating_tag.extract()
        if not tags:
            tags_tag = soup.find(attrs={"class": "cbj_tags"})
            if tags_tag is not None:
                tags_tag.extract()
        if not pubdate:
            pubdate_tag = soup.find(attrs={"class": "cbj_pubdata"})
            if pubdate_tag is not None:
                pubdate_tag.extract()
        if output_profile.short_name != "kindle":
            hr_tag = soup.find("hr", attrs={"class": "cbj_kindle_banner_hr"})
            if hr_tag is not None:
                hr_tag.extract()

        return strip_encoding_declarations(soup.renderContents("utf-8").decode("utf-8"))
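
The post-processing step in generate_html is just conditional tag removal: find a node by class and extract() it whenever the corresponding metadata value is empty. The same pattern in a self-contained sketch, using the standalone bs4 package in place of calibre's bundled BeautifulSoup:

from bs4 import BeautifulSoup

def strip_if_empty(html, value, css_class):
    soup = BeautifulSoup(html, 'html.parser')
    if not value:
        tag = soup.find(attrs={'class': css_class})
        if tag is not None:
            tag.extract()  # detach the node from the tree in place
    return str(soup)

html = '<div><span class="cbj_series"></span><p>body</p></div>'
print(strip_if_empty(html, '', 'cbj_series'))  # -> <div><p>body</p></div>
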
def move_annotations(parent, annotation_map, old_destination_field, new_destination_field,
                     window_title="Moving annotations"):
    '''
    Move annotations from old_destination_field to new_destination_field
    annotation_map precalculated in thread in config.py
    '''
    import calibre_plugins.annotations.config as cfg

    _log_location("%s -> %s" % (old_destination_field, new_destination_field))

    db = parent.opts.gui.current_db
    id = db.FIELD_MAP['id']

    # Show progress
    pb = ProgressBar(parent=parent, window_title=window_title, on_top=True)
    total_books = len(annotation_map)
    pb.set_maximum(total_books)
    pb.set_value(1)
    pb.set_label('{:^100}'.format('Moving annotations for %d books' % total_books))
    pb.show()

    transient_db = 'transient'

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        cfg.plugin_prefs.get('COMMENTS_DIVIDER', '&middot;  &middot;  &bull;  &middot;  &#x2726;  &middot;  &bull;  &middot; &middot;'))

    for cid in annotation_map:
        mi = db.get_metadata(cid, index_is_id=True)

        # Comments -> custom
        if old_destination_field == 'Comments' and new_destination_field.startswith('#'):
            if mi.comments:
                old_soup = BeautifulSoup(mi.comments)
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from Comments
                    uas.extract()

                    # Remove comments_divider from Comments
                    cd = old_soup.find('div', 'comments_divider')
                    if cd:
                        cd.extract()

                    # Save stripped Comments
                    mi.comments = unicode(old_soup)

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Add user_annotations to destination
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record with stripped Comments, populated custom field
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # custom -> Comments
        elif old_destination_field.startswith('#') and new_destination_field == 'Comments':
            if mi.get_user_metadata(old_destination_field, False)['#value#'] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um['#value#'] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add user_annotations to Comments
                    if mi.comments is None:
                        mi.comments = unicode(new_soup)
                    else:
                        mi.comments = mi.comments + \
                                      unicode(comments_divider) + \
                                      unicode(new_soup)

                    # Update the record with stripped custom field, updated Comments
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # custom -> custom
        elif old_destination_field.startswith('#') and new_destination_field.startswith('#'):

            if mi.get_user_metadata(old_destination_field, False)['#value#'] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um['#value#'] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # same field -> same field - called from config:configure_appearance()
        elif (old_destination_field == new_destination_field):
            pb.set_label('{:^100}'.format('Updating annotations for %d books' % total_books))

            if new_destination_field == 'Comments':
                if mi.comments:
                    old_soup = BeautifulSoup(mi.comments)
                    uas = old_soup.find('div', 'user_annotations')
                    if uas:
                        # Remove user_annotations from Comments
                        uas.extract()

                        # Remove comments_divider from Comments
                        cd = old_soup.find('div', 'comments_divider')
                        if cd:
                            cd.extract()

                        # Save stripped Comments
                        mi.comments = unicode(old_soup)

                        # Capture content
                        parent.opts.db.capture_content(uas, cid, transient_db)

                        # Regurgitate content with current CSS style
                        new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                        # Add user_annotations to Comments
                        if mi.comments is None:
                            mi.comments = unicode(new_soup)
                        else:
                            mi.comments = mi.comments + \
                                          unicode(comments_divider) + \
                                          unicode(new_soup)

                        # Update the record with stripped custom field, updated Comments
                        db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                        commit=True, force_changes=True, notify=True)
                        pb.increment()

            else:
                # Update custom field
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Add stripped old_soup plus new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(old_soup) + unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

    # Hide the progress bar
    pb.hide()

    # Change field value to friendly name
    if old_destination_field.startswith('#'):
        for cf in parent.custom_fields:
            if parent.custom_fields[cf]['field'] == old_destination_field:
                old_destination_field = cf
                break
    if new_destination_field.startswith('#'):
        for cf in parent.custom_fields:
            if parent.custom_fields[cf]['field'] == new_destination_field:
                new_destination_field = cf
                break

    # Report what happened
    if old_destination_field == new_destination_field:
        msg = "<p>Annotations updated to new appearance settings for %d {0}.</p>" % len(annotation_map)
    else:
        msg = ("<p>Annotations for %d {0} moved from <b>%s</b> to <b>%s</b>.</p>" %
                (len(annotation_map), old_destination_field, new_destination_field))
    if len(annotation_map) == 1:
        msg = msg.format('book')
    else:
        msg = msg.format('books')
    MessageBox(MessageBox.INFO,
               '',
               msg=msg,
               show_copy_button=False,
               parent=parent.gui).exec_()
    _log_location()
    _log("INFO: %s" % msg)

    # Update the UI
    updateCalibreGUIView()
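
The COMMENTS_DIVIDER assembly at the top of both move_annotations variants follows the same recipe: read a stored preference with a fallback default, then wrap it in a centred <div> whose 'comments_divider' class lets it be found and stripped again later. A runnable sketch with a plain dict standing in for calibre's plugin_prefs store:

plugin_prefs = {}  # stands in for calibre's JSONConfig-backed preferences
divider_text = plugin_prefs.get(
    'COMMENTS_DIVIDER',
    '&middot;  &middot;  &bull;  &middot;  &#x2726;  &middot;  &bull;  &middot; &middot;')
comments_divider = ('<div class="comments_divider">'
                    '<p style="text-align:center;margin:1em 0 1em 0">{0}</p>'
                    '</div>').format(divider_text)
print(comments_divider)
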
Example #26
0
    def generate_html(comments):
        args = dict(xmlns=XHTML_NS,
                    title_str=title_str,
                    css=css,
                    title=title,
                    author=author,
                    publisher=publisher,
                    pubdate_label=_('Published'), pubdate=pubdate,
                    series_label=_('Series'), series=series,
                    rating_label=_('Rating'), rating=rating,
                    tags_label=_('Tags'), tags=tags,
                    comments=comments,
                    footer='',
                    searchable_tags=' '.join(escape(t)+'ttt' for t in tags.tags_list),
                    )
        for key in mi.custom_field_keys():
            m = mi.get_user_metadata(key, False) or {}
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                dkey = key.replace('#', '_')
                dt = m.get('datatype')
                if dt == 'series':
                    args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
                elif dt == 'rating':
                    args[dkey] = rating_to_stars(mi.get(key), m.get('display', {}).get('allow_half_stars', False))
                elif dt == 'comments':
                    val = val or ''
                    display = m.get('display', {})
                    ctype = display.get('interpret_as') or 'html'
                    if ctype == 'long-text':
                        val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(val)
                    elif ctype == 'short-text':
                        val = '<span>%s</span>' % escape(val)
                    elif ctype == 'markdown':
                        val = markdown(val)
                    else:
                        val = comments_to_html(val)
                    args[dkey] = val
                else:
                    args[dkey] = escape(val)
                args[dkey+'_label'] = escape(display_name)
            except Exception:
                # if the val (custom column contents) is None, don't add to args
                pass

        if False:
            print("Custom column values available in jacket template:")
            for key in args.keys():
                if key.startswith('_') and not key.endswith('_label'):
                    print(" %s: %s" % ('#' + key[1:], args[key]))

        # Used in the comment describing use of custom columns in templates
        # Don't change this unless you also change it in template.xhtml
        args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
        args['_genre'] = args.get('_genre', '{_genre}')

        formatter = SafeFormatter()
        generated_html = formatter.format(template, **args)

        # Post-process the generated html to strip out empty header items

        soup = BeautifulSoup(generated_html)
        if not series:
            series_tag = soup.find(attrs={'class':'cbj_series'})
            if series_tag is not None:
                series_tag.extract()
        if not rating:
            rating_tag = soup.find(attrs={'class':'cbj_rating'})
            if rating_tag is not None:
                rating_tag.extract()
        if not tags:
            tags_tag = soup.find(attrs={'class':'cbj_tags'})
            if tags_tag is not None:
                tags_tag.extract()
        if not pubdate:
            pubdate_tag = soup.find(attrs={'class':'cbj_pubdata'})
            if pubdate_tag is not None:
                pubdate_tag.extract()
        if output_profile.short_name != 'kindle':
            hr_tag = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'})
            if hr_tag is not None:
                hr_tag.extract()

        return strip_encoding_declarations(
                soup.renderContents('utf-8').decode('utf-8'))
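
This generate_html variant is the most complete: it dispatches on each custom column's datatype before templating, and its 'interpret_as' handling normalises every comments-type variant to HTML. A trimmed, runnable sketch of that dispatch (the markdown and default-HTML branches, which call converters not shown here, are left as comments):

from html import escape

def render_comments(val, ctype='html'):
    val = val or ''
    if ctype == 'long-text':
        return '<pre style="white-space:pre-wrap">%s</pre>' % escape(val)
    if ctype == 'short-text':
        return '<span>%s</span>' % escape(val)
    # 'markdown' would run a markdown-to-HTML converter, and the default
    # case calibre's comments_to_html(); both are omitted in this sketch.
    return val

print(render_comments('a < b', 'short-text'))  # -> <span>a &lt; b</span>
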
Example #27
0
    def preprocess_html(self, soup):
        if self.webEdition & (self.oldest_article>0):
            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
            if date_tag:
                date_str = self.tag_to_string(date_tag,use_alt=False)
                date_str = date_str.replace('Published:','')
                date_items = date_str.split(',')
                try:
                    datestring = date_items[0]+' '+date_items[1]
                    article_date = self.decode_us_date(datestring)
                except:
                    article_date = date.today()
                if article_date < self.earliest_date:
                    self.log("Skipping article dated %s" % date_str)
                    return None

        #all articles are from today, no need to print the date on every page
        try:
            if not self.webEdition:
                date_tag = soup.find(True,attrs={'class': ['dateline','date']})
                if date_tag:
                    date_tag.extract()
        except:
            self.log("Error removing the published date")

        if self.useHighResImages:
            try:
                #open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
                enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                if enlargeThisList:
                    for popupref in enlargeThisList:
                        popupreflink = popupref.find('a')
                        if popupreflink:
                            reflinkstring = str(popupreflink['href'])
                            refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('")
                            refend = reflinkstring.find(".html", refstart) + len(".html")
                            reflinkstring = reflinkstring[refstart:refend]

                            popuppage = self.browser.open(reflinkstring)
                            popuphtml = popuppage.read()
                            popuppage.close()
                            if popuphtml:
                                st = time.localtime()
                                year = str(st.tm_year)
                                month = "%.2d" % st.tm_mon
                                day = "%.2d" % st.tm_mday
                                imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/')
                                highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
                                popupSoup = BeautifulSoup(popuphtml)
                                highResTag = popupSoup.find('img', {'src':highResImageLink})
                                if highResTag:
                                    imageTag = None
                                    try:
                                        newWidth = highResTag['width']
                                        newHeight = highResTag['height']
                                        imageTag = popupref.parent.find("img")
                                    except:
                                        self.log("Error: finding width and height of img")
                                    popupref.extract()
                                    if imageTag:
                                        try:
                                            imageTag['src'] = highResImageLink
                                            imageTag['width'] = newWidth
                                            imageTag['height'] = newHeight
                                        except:
                                            self.log("Error setting the src width and height parameters")
            except Exception:
                self.log("Error pulling high resolution images")

            try:
                #remove "Related content" bar
                runAroundsFound = soup.findAll('div',{'class':['articleInline runaroundLeft','articleInline doubleRule runaroundLeft','articleInline runaroundLeft firstArticleInline','articleInline runaroundLeft  ','articleInline runaroundLeft  lastArticleInline']})
                if runAroundsFound:
                    for runAround in runAroundsFound:
                        #find all section headers
                        hlines = runAround.findAll(True ,{'class':['sectionHeader','sectionHeader flushBottom']})
                        if hlines:
                            for hline in hlines:
                                hline.extract()

                        #find all section headers
                        hlines = runAround.findAll('h6')
                        if hlines:
                            for hline in hlines:
                                hline.extract()
            except:
                self.log("Error removing related content bar")


            try:
                #in case pulling images failed, delete the enlarge this text
                enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                if enlargeThisList:
                    for popupref in enlargeThisList:
                        popupref.extract()
            except:
                self.log("Error removing Enlarge this text")

        return self.strip_anchors(soup)
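
The slicing-based URL extraction in preprocess_html (find the "javascript:pop_me_up2('" prefix, then slice up to ".html") is fragile; a regular expression over the href expresses the same intent more directly. This is an illustrative rewrite, not the recipe's own code:

import re

href = "javascript:pop_me_up2('http://www.nytimes.com/imagepages/x.html','930_600')"
m = re.search(r"pop_me_up2\('([^']+?\.html)", href)
if m:
    print(m.group(1))  # -> http://www.nytimes.com/imagepages/x.html
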
Example #28
0
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    old_soup, new_soup: BeautifulSoup()
    Need to strip <hr>, re-sort based on location, build new merged_soup
    with optional interleaved <hr> elements.
    '''
    TRANSIENT_DB = 'transient'
    debug_print("merge_annotations - cid=", cid)
    debug_print("merge_annotations - old_soup=", old_soup)
    debug_print("merge_annotations - new_soup=", new_soup)

    # Fetch preferred merge index technique
    merge_index = getattr(parent.reader_app_class, 'MERGE_INDEX', 'hash')

    if merge_index == 'hash':
        # Get the hashes of any existing annotations
        oiuas = old_soup.findAll('div', 'annotation')
        old_hashes = set([ua['hash'] for ua in oiuas])
        debug_print("old hashes=", old_hashes)

        # Extract old user_annotations
        regurgitated_soup = None
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            debug_print("Getting old annotations - count=", len(ouas))
            debug_print("Getting old annotations - old_soup=", old_soup)
            debug_print("Getting old annotations - ouas=", ouas)
            ouas.extract()
            debug_print("Getting old annotations - ouas after extract=", ouas)
            debug_print("Getting old annotations - old_soup after extract=", old_soup)

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))
            debug_print("Getting old annotations - regurgitated_soup=", regurgitated_soup)

        # Find new annotations
        uas = new_soup.findAll('div', 'annotation')
        new_hashes = set([ua['hash'] for ua in uas])
        debug_print("new_hashes=", sorted(new_hashes))
        debug_print("old hashes=", sorted(old_hashes))
        debug_print("new_hashes.difference(old_hashes)=", new_hashes.difference(old_hashes))

        updates = list(new_hashes.difference(old_hashes))
        debug_print("differences between old and new hashs - updates=", updates)
        if updates and regurgitated_soup is not None:
            debug_print("have updates and a regurgitated old soup")
            # Append new to regurgitated
            dtc = len(regurgitated_soup.div)
            debug_print("length regurgitated_soup - dtc=", dtc)
            for new_annotation_id in updates:
                debug_print("extending regurgitated_soup - new_annotation_id=", new_annotation_id)
                new_annotation = new_soup.find('div', {'hash': new_annotation_id})
                regurgitated_soup.div.insert(dtc, new_annotation)
                dtc += 1
#             if old_soup:
#                 debug_print("adding old_soup and new_soup")
#                 merged_soup = unicode(old_soup) + unicode(sort_merged_annotations(regurgitated_soup))
#             else:
#                 debug_print("just new_soup")
            merged_soup = unicode(sort_merged_annotations(regurgitated_soup))
        else:
            debug_print("have updates and ouas")
            if regurgitated_soup:
                debug_print("adding old_soup and new_soup")
                debug_print("unicode(regurgitated_soup)=", unicode(regurgitated_soup))
                debug_print("unicode(new_soup)=", unicode(new_soup))
                merged_soup = unicode(regurgitated_soup)  # + unicode(new_soup)
            else:
                debug_print("just new_soup")
                merged_soup = unicode(new_soup)
        debug_print("merged_soup=", merged_soup)
        return merged_soup

    elif merge_index == 'timestamp':
        timestamps = {}
        # Get the timestamps and hashes of the stored annotations
        suas = old_soup.findAll('div', 'annotation')
        for sua in suas:
            try:
                timestamp = sua.find('td', 'timestamp')['uts']
                timestamps[timestamp] = {'stored_hash': sua['hash']}
            except:
                continue

        # Rerender stored annotations (the stored-only entries below assume
        # this user_annotations div exists and was re-rendered successfully)
        regurgitated_soup = None
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))

        # Add device annotation timestamps and hashes
        duas = new_soup.findAll('div', 'annotation')
        for dua in duas:
            try:
                timestamp = dua.find('td', 'timestamp')['uts']
                if timestamp in timestamps:
                    timestamps[timestamp]['device_hash'] = dua['hash']
                else:
                    timestamps[timestamp] = {'device_hash': dua['hash']}
            except:
                print("ERROR: malformed timestamp in device annotation")
                print(dua.prettify())

        merged_soup = BeautifulSoup(ANNOTATIONS_HEADER)

        for ts in sorted(timestamps):
            if 'stored_hash' in timestamps[ts] and not 'device_hash' in timestamps[ts]:
                # Stored only - add from regurgitated_soup
                annotation = regurgitated_soup.find('div', {'hash': timestamps[ts]['stored_hash']})

            elif not 'stored_hash' in timestamps[ts] and 'device_hash' in timestamps[ts]:
                # Device only - add from new_soup
                annotation = new_soup.find('div', {'hash': timestamps[ts]['device_hash']})

            elif timestamps[ts]['stored_hash'] == timestamps[ts]['device_hash']:
                # Stored matches device - add from regurgitated_soup, as user may have modified
                annotation = regurgitated_soup.find('div', {'hash': timestamps[ts]['stored_hash']})

            elif timestamps[ts]['stored_hash'] != timestamps[ts]['device_hash']:
                # Device has been updated since initial capture - add from new_soup
                annotation = new_soup.find('div', {'hash': timestamps[ts]['device_hash']})

            else:
                continue

            merged_soup.div.append(annotation)

        return unicode(sort_merged_annotations(merged_soup))
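
The timestamp branch shared by both merge_annotations listings walks the union of stored and device annotations in timestamp order and picks a source for each: stored-only entries come from the re-rendered library copy, device-only and device-revised entries from the device. A self-contained sketch of that decision table, with made-up timestamps and hashes:

timestamps = {
    '1001': {'stored_hash': 'a1'},                       # stored only
    '1002': {'stored_hash': 'b2', 'device_hash': 'b9'},  # revised on device
    '1003': {'device_hash': 'c3'},                       # device only
}
for ts in sorted(timestamps):
    entry = timestamps[ts]
    if 'device_hash' not in entry:
        source = 'stored'   # keep the re-rendered library copy
    elif 'stored_hash' not in entry:
        source = 'device'   # new on the device
    elif entry['stored_hash'] == entry['device_hash']:
        source = 'stored'   # unchanged; stored copy carries current CSS
    else:
        source = 'device'   # device revised since the last capture
    print(ts, '->', source)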