Ejemplo n.º 1
0
    def get_soup(self, src, url=None):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        # Some websites have buggy doctype declarations that mess up beautifulsoup
        nmassage += [(re.compile(r'<!DOCTYPE .+?>',
                                 re.DOTALL | re.IGNORECASE), lambda m: '')]
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        soup = BeautifulSoup(usrc, markupMassage=nmassage)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            soup = BeautifulSoup(xml_to_unicode(replace,
                                                self.verbose,
                                                strip_encoding_pats=True)[0],
                                 markupMassage=nmassage)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(
                self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(
                self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Ejemplo n.º 2
0
    def postprocess_html(self, soup, first_fetch):
        author_general = soup.find('span', { 'class': 'author_general' })
        author_general.em.extract()
    
        # the complete content
        full_div = None
    
        transcript_div = soup.find('div', { 'id': 'transcript' })
        if transcript_div: # that's an interview
            # get all <div class="qa" />
            qa_div_list = list(find_by_class(transcript_div, 'div', 'qa'))
            for qa_div in qa_div_list:
                qa_div.extract()
                
                # replace all <a class="question_link">...</a> with <strong>...</strong>
                question_link = qa_div.find('a', { 'class': 'question_link' })
                question_strong = Tag(soup, 'strong')
                question_strong.append(question_link.string)
                question_link.replaceWith(question_strong)
            
            full_div = find_by_class(soup.find('div', { 'id': 'content' }), 'div', 'presentation_full').next()
            
            # clean the <h1 />
            full_div.h1.span.extract()
            title_div = full_div.h1.div
            title_div.replaceWith(title_div.string)
            
            # clear the presentation area
            for div in full_div.findAll('div'):
                div.extract()
            
            # add qa list back to presentation area
            for qa_div in qa_div_list:
                full_div.append(qa_div)
        else:
            # text only without title
            text_div = find_by_class(soup, 'div', 'text_info').next()
            text_div.extract()
            
            for other in text_div.findAll('div'):
                other.extract()
            
            # full_div contains title
            full_div = soup.find('div', { 'id': 'content' })
            for other in full_div.findAll('div'):
                other.extract()
            
            full_div.append(text_div)

        # keep full_div in <body /> only
        full_div.extract()
        
        for other in soup.body:
            other.extract()
            
        soup.body.append(full_div)

        return soup
Ejemplo n.º 3
0
    def get_soup(self, src, url=None):
        nmassage = []
        nmassage.extend(self.preprocess_regexps)
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        for pat, repl in nmassage:
            usrc = pat.sub(repl, usrc)
        set_soup_module(sys.modules[BeautifulSoup.__module__])
        soup = parse(usrc, return_root=False)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
            for pat, repl in nmassage:
                replace = pat.sub(repl, replace)
            soup = parse(replace, return_root=False)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Ejemplo n.º 4
0
    def get_soup(self, src, url=None):
        nmassage = []
        nmassage.extend(self.preprocess_regexps)
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        for pat, repl in nmassage:
            usrc = pat.sub(repl, usrc)
        set_soup_module(sys.modules[BeautifulSoup.__module__])
        soup = parse(usrc, return_root=False)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
            for pat, repl in nmassage:
                replace = pat.sub(repl, replace)
            soup = parse(replace, return_root=False)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Ejemplo n.º 5
0
    def get_soup(self, src, url=None):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        # Some websites have buggy doctype declarations that mess up beautifulsoup
        nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        soup = BeautifulSoup(usrc, markupMassage=nmassage)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Ejemplo n.º 6
0
 def _inject_css(self, html):
     '''
     stick a <style> element into html
     '''
     css = self.prefs.get('injected_css', None)
     if css:
         try:
             styled_soup = BeautifulSoup(html)
             head = styled_soup.find("head")
             style_tag = Tag(styled_soup, 'style')
             style_tag['type'] = "text/css"
             style_tag.insert(0, css)
             head.insert(0, style_tag)
             html = styled_soup.renderContents()
         except:
             return html
     return (html)
Ejemplo n.º 7
0
 def inject_css(self, html):
     '''
     stick a <style> element into html
     Deep View content structured differently
     <html style=""><body style="">
     '''
     css = str(self.css_pte.toPlainText())
     if css:
         raw_soup = self._remove_old_style(html)
         style_tag = Tag(raw_soup, 'style')
         style_tag['type'] = "text/css"
         style_tag.insert(0, css)
         head = raw_soup.find("head")
         head.insert(0, style_tag)
         self.styled_soup = raw_soup
         html = self.styled_soup.renderContents()
     return html
 def inject_css(self, html):
     '''
     stick a <style> element into html
     Deep View content structured differently
     <html style=""><body style="">
     '''
     css = str(self.css_pte.toPlainText())
     if css:
         raw_soup = self._remove_old_style(html)
         style_tag = Tag(raw_soup, 'style')
         style_tag['type'] = "text/css"
         style_tag.insert(0, css)
         head = raw_soup.find("head")
         head.insert(0, style_tag)
         self.styled_soup = raw_soup
         html = self.styled_soup.renderContents()
     return html
Ejemplo n.º 9
0
 def _inject_css(self, html):
     '''
     stick a <style> element into html
     '''
     css = self.prefs.get('injected_css', None)
     if css:
         try:
             styled_soup = BeautifulSoup(html)
             head = styled_soup.find("head")
             style_tag = Tag(styled_soup, 'style')
             style_tag['type'] = "text/css"
             style_tag.insert(0, css)
             head.insert(0, style_tag)
             html = styled_soup.renderContents()
         except:
             return html
     return(html)
Ejemplo n.º 10
0
    def add_annotation_to_library(self, db, db_id, annotation):
        from calibre.ebooks.BeautifulSoup import Tag
        from calibre.ebooks.metadata import MetaInformation

        bm = annotation
        ignore_tags = set(['Catalog', 'Clippings'])

        if bm.type == 'kindle_bookmark':
            mi = db.get_metadata(db_id, index_is_id=True)
            user_notes_soup = self.generate_annotation_html(bm.value)
            if mi.comments:
                a_offset = mi.comments.find('<div class="user_annotations">')
                ad_offset = mi.comments.find(
                    '<hr class="annotations_divider" />')

                if a_offset >= 0:
                    mi.comments = mi.comments[:a_offset]
                if ad_offset >= 0:
                    mi.comments = mi.comments[:ad_offset]
                if set(mi.tags).intersection(ignore_tags):
                    return
                if mi.comments:
                    hrTag = Tag(user_notes_soup, 'hr')
                    hrTag['class'] = 'annotations_divider'
                    user_notes_soup.insert(0, hrTag)

                mi.comments += unicode(user_notes_soup.prettify())
            else:
                mi.comments = unicode(user_notes_soup.prettify())
            # Update library comments
            db.set_comment(db_id, mi.comments)

            # Add bookmark file to db_id
            db.add_format_with_hooks(db_id,
                                     bm.value.bookmark_extension,
                                     bm.value.path,
                                     index_is_id=True)
        elif bm.type == 'kindle_clippings':
            # Find 'My Clippings' author=Kindle in database, or add
            last_update = 'Last modified %s' % strftime(
                u'%x %X', bm.value['timestamp'].timetuple())
            mc_id = list(
                db.data.search_getting_ids('title:"My Clippings"',
                                           '',
                                           sort_results=False))
            if mc_id:
                db.add_format_with_hooks(mc_id[0],
                                         'TXT',
                                         bm.value['path'],
                                         index_is_id=True)
                mi = db.get_metadata(mc_id[0], index_is_id=True)
                mi.comments = last_update
                db.set_metadata(mc_id[0], mi)
            else:
                mi = MetaInformation('My Clippings', authors=['Kindle'])
                mi.tags = ['Clippings']
                mi.comments = last_update
                db.add_books([bm.value['path']], ['txt'], [mi])
Ejemplo n.º 11
0
    def preview_css(self):
        '''
        Construct a dummy set of notes and annotation for preview purposes
        Modeled after book_status:_get_formatted_annotations()
        '''
        from calibre_plugins.marvin_manager.annotations import (
            ANNOTATIONS_HTML_TEMPLATE, Annotation, Annotations, BookNotes, BookmarkNotes)

        # Assemble the preview soup
        soup = BeautifulSoup(ANNOTATIONS_HTML_TEMPLATE)

        # Load the CSS from MXD resources
        path = os.path.join(self.parent.opts.resources_path, 'css', 'annotations.css')
        with open(path, 'rb') as f:
            css = f.read().decode('utf-8')
        style_tag = Tag(soup, 'style')
        style_tag.insert(0, css)
        soup.head.style.replaceWith(style_tag)

        # Assemble the sample Book notes
        book_notes_soup = BookNotes().construct(self.sample_book_notes)
        soup.body.append(book_notes_soup)
        cd_tag = Tag(soup, 'div', [('class', "divider")])
        soup.body.append(cd_tag)

        # Assemble the sample Bookmark notes
        bookmark_notes_soup = BookmarkNotes().construct(self.sample_bookmark_notes)
        soup.body.append(bookmark_notes_soup)
        cd_tag = Tag(soup, 'div', [('class', "divider")])
        soup.body.append(cd_tag)

        # Assemble the sample annotations
        pas = Annotations(None, title="Preview")
        pas.annotations.append(Annotation(self.sample_ann_1))
        pas.annotations.append(Annotation(self.sample_ann_2))
        pas.annotations.append(Annotation(self.sample_ann_3))
        annotations_soup = pas.to_HTML(pas.create_soup())
        soup.body.append(annotations_soup)

        self.parent.wv.setHtml(unicode(soup.renderContents()))
 def construct(self, book_notes):
     '''
     Given a list of notes, render HTML
     '''
     soup = None
     if book_notes:
         soup = BeautifulSoup(
             '''<div class="{0}"></div>'''.format('book_notes'))
         for note in book_notes:
             div_tag = Tag(soup, 'div', [('class', "book_note")])
             p_tag = Tag(soup, 'p',
                         [('class', "book_note"),
                          ('style', "{0}".format(self._get_note_style()))])
             p_tag.append(note)
             div_tag.append(p_tag)
             soup.div.append(div_tag)
     return soup
Ejemplo n.º 13
0
    def postprocess_html(self, soup, first):
        for rmattrs in ['almenu', 'doc_author_docs', 'doc_print']:
            for item in soup.findAll('div', attrs={'class': rmattrs}):
                item.extract()
        for pz in soup.findAll('p', attrs={'align': 'left'}):
            myp = self.tag_to_string(pz)
            if re.search('^( |&#160;)*$', myp):
                tag = Tag(soup, "div")
                tag['class'] = "removable"
                tag.insert(0, '')
                pz.replaceWith(tag)
            for brz in soup.findAll('br'):
                tag = Tag(soup, "div")
                tag['class'] = "removable"
                tag.insert(0, '')
                brz.replaceWith(tag)

        return soup
Ejemplo n.º 14
0
 def construct(self, book_notes):
     '''
     Given a list of notes, render HTML
     '''
     soup = None
     if book_notes:
         soup = BeautifulSoup('''<div class="{0}"></div>'''.format('book_notes'))
         for note in book_notes:
             div_tag = Tag(soup, 'div', [('class', "book_note")])
             p_tag = Tag(soup, 'p', [('class', "book_note"),
                                     ('style', "{0}".format(self._get_note_style()))])
             p_tag.append(note)
             div_tag.append(p_tag)
             soup.div.append(div_tag)
     return soup
Ejemplo n.º 15
0
    def preview_css(self):
        '''
        Construct a dummy set of notes and annotation for preview purposes
        Modeled after book_status:_get_formatted_annotations()
        '''
        from calibre_plugins.marvin_manager.annotations import (
            ANNOTATIONS_HTML_TEMPLATE, Annotation, Annotations, BookNotes, BookmarkNotes)

        # Assemble the preview soup
        soup = BeautifulSoup(ANNOTATIONS_HTML_TEMPLATE)

        # Load the CSS from MXD resources
        path = os.path.join(self.parent.opts.resources_path, 'css', 'annotations.css')
        with open(path, 'rb') as f:
            css = f.read().decode('utf-8')
        style_tag = Tag(soup, 'style')
        style_tag.insert(0, css)
        soup.head.style.replaceWith(style_tag)

        # Assemble the sample Book notes
        book_notes_soup = BookNotes().construct(self.sample_book_notes)
        soup.body.append(book_notes_soup)
        cd_tag = Tag(soup, 'div', [('class', "divider")])
        soup.body.append(cd_tag)

        # Assemble the sample Bookmark notes
        bookmark_notes_soup = BookmarkNotes().construct(self.sample_bookmark_notes)
        soup.body.append(bookmark_notes_soup)
        cd_tag = Tag(soup, 'div', [('class', "divider")])
        soup.body.append(cd_tag)

        # Assemble the sample annotations
        pas = Annotations(None, title="Preview")
        pas.annotations.append(Annotation(self.sample_ann_1))
        pas.annotations.append(Annotation(self.sample_ann_2))
        pas.annotations.append(Annotation(self.sample_ann_3))
        annotations_soup = pas.to_HTML(pas.create_soup())
        soup.body.append(annotations_soup)

        self.parent.wv.setHtml(unicode(soup.renderContents()))
Ejemplo n.º 16
0
    def postprocess_html(self,soup, True):

        if self.one_picture_per_article:
            # Remove all images after first
            largeImg = soup.find(True, {'class':'articleSpanImage'})
            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
            if largeImg:
                for inlineImg in inlineImgs:
                    inlineImg.extract()
            else:
                if inlineImgs:
                    firstImg = inlineImgs[0]
                    for inlineImg in inlineImgs[1:]:
                        inlineImg.extract()
                    # Move firstImg before article body
                    article_body = soup.find(True, {'id':'articleBody'})
                    cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
                    if cgFirst:
                        # Strip all sibling NavigableStrings: noise
                        navstrings = cgFirst.findAll(text=True, recursive=False)
                        [ns.extract() for ns in navstrings]
                        headline_found = False
                        tag = cgFirst.find(True)
                        insertLoc = 0
                        while True:
                            insertLoc += 1
                            if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
                                    headline_found = True
                                    break
                            tag = tag.nextSibling
                            if not tag:
                                headline_found = False
                                break
                        if headline_found:
                            cgFirst.insert(insertLoc,firstImg)
                    else:
                        self.log(">>> No class:'columnGroup first' found <<<")

        # Change captions to italic 
        for caption in soup.findAll(True, {'class':'caption'}) :
            if caption and caption.contents[0]:
                cTag = Tag(soup, "p", [("class", "caption")])
                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                mp_off = c.find("More Photos")
                if mp_off >= 0:
                    c = c[:mp_off]
                cTag.insert(0, c)
                caption.replaceWith(cTag)

        # Change <nyt_headline> to <h2>
        h1 = soup.find('h1')
        if h1:
            headline = h1.find("nyt_headline")
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.contents[0]))
                h1.replaceWith(tag)
        else:
            # Blog entry - replace headline, remove <hr> tags
            headline = soup.find('title')
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.contents[0]))
                soup.insert(0, tag)
                hrs = soup.findAll('hr')
                for hr in hrs:
                    hr.extract()

        # Change <h1> to <h3> - used in editorial blogs
        masthead = soup.find("h1")
        if masthead:
            # Nuke the href
            if masthead.a:
                del(masthead.a['href'])
            tag = Tag(soup, "h3")
            tag.insert(0, self.fixChars(masthead.contents[0]))
            masthead.replaceWith(tag)

        # Change <span class="bold"> to <b>
        for subhead in soup.findAll(True, {'class':'bold'}) :
            if subhead.contents:
                bTag = Tag(soup, "b")
                bTag.insert(0, subhead.contents[0])
                subhead.replaceWith(bTag)

        divTag = soup.find('div',attrs={'id':'articleBody'})
        if divTag:
            divTag['class'] = divTag['id']

        # Add class="authorId" to <div> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'authorId'})
        if divTag and divTag.contents[0]:
            tag = Tag(soup, "p")
            tag['class'] = "authorId"
            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                             use_alt=False)))
            divTag.replaceWith(tag)

        return soup
Ejemplo n.º 17
0
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    old_soup, new_soup: BeautifulSoup()
    Need to strip <hr>, re-sort based on location, build new merged_soup
    with optional interleaved <hr> elements.
    '''
    TRANSIENT_DB = 'transient'

    if False:
        '''
        Older technique: Use hashes to merge annotations
        '''
        #Get the hashes of any existing annotations
        oiuas = old_soup.findAll('div', 'annotation')
        old_hashes = set([ua['hash'] for ua in oiuas])

        # Extract old user_annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))

        # Find new annotations
        uas = new_soup.findAll('div', 'annotation')
        new_hashes = set([ua['hash'] for ua in uas])

        updates = list(new_hashes.difference(old_hashes))
        if len(updates) and ouas is not None:
            # Append new to regurgitated
            dtc = len(regurgitated_soup.div)
            for new_annotation_id in updates:
                new_annotation = new_soup.find('div', {'hash': new_annotation_id})
                regurgitated_soup.div.insert(dtc, new_annotation)
                dtc += 1
            if old_soup:
                merged_soup = unicode(old_soup) + unicode(sort_merged_annotations(regurgitated_soup))
            else:
                merged_soup = unicode(sort_merged_annotations(regurgitated_soup))
        else:
            if old_soup:
                merged_soup = unicode(old_soup) + unicode(new_soup)
            else:
                merged_soup = unicode(new_soup)
        return merged_soup

    else:
        '''
        Newer technique: Use timestamps to merge annotations
        '''
        timestamps = {}
        # Get the timestamps and hashes of the stored annotations
        suas = old_soup.findAll('div', 'annotation')
        for sua in suas:
            #print("sua: %s" % sua.prettify())
            timestamp = sua.find('td', 'timestamp')['uts']
            timestamps[timestamp] = {'stored_hash': sua['hash']}

        # Rerender stored annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate annotations with current CSS
            rerendered_annotations = parent.opts.db.rerender_to_html(TRANSIENT_DB, cid)
            regurgitated_soup = BeautifulSoup(rerendered_annotations)

        # Add device annotation timestamps and hashes
        duas = new_soup.findAll('div', 'annotation')
        for dua in duas:
            timestamp = dua.find('td', 'timestamp')['uts']
            if timestamp in timestamps:
                timestamps[timestamp]['device_hash'] = dua['hash']
            else:
                timestamps[timestamp] = {'device_hash': dua['hash']}

        merged_annotations = Tag(BeautifulSoup(), 'div',
            [('class', "user_annotations"), ('style','margin:0')])

        for ts in sorted(timestamps):
            if 'stored_hash' in timestamps[ts] and not 'device_hash' in timestamps[ts]:
                # Stored only - add from regurgitated_soup
                annotation = regurgitated_soup.find('div', {'hash': timestamps[ts]['stored_hash']})

            elif not 'stored_hash' in timestamps[ts] and 'device_hash' in timestamps[ts]:
                # Device only - add from new_soup
                annotation = new_soup.find('div', {'hash': timestamps[ts]['device_hash']})

            elif timestamps[ts]['stored_hash'] == timestamps[ts]['device_hash']:
                # Stored matches device - add from regurgitated_soup, as user may have modified
                annotation = regurgitated_soup.find('div', {'hash': timestamps[ts]['stored_hash']})

            elif timestamps[ts]['stored_hash'] != timestamps[ts]['device_hash']:
                # Device has been updated since initial capture - add from new_soup
                annotation = new_soup.find('div', {'hash': timestamps[ts]['device_hash']})

            else:
                continue

            merged_annotations.append(annotation)

        merged_annotations = sort_merged_annotations(merged_annotations)

        # Update new_soup with merged_annotations
        new_soup_uas = new_soup.find('div', 'user_annotations')
        new_soup_uas.replaceWith(merged_annotations)

        return unicode(new_soup)
Ejemplo n.º 18
0
    def to_HTML(self, header=''):
        '''
        Generate HTML with user-specified CSS, element order
        '''
        # Retrieve CSS prefs
        from calibre_plugins.marvin_manager.appearance import default_elements
        stored_css = plugin_prefs.get('appearance_css', default_elements)

        elements = []
        for element in stored_css:
            elements.append(element['name'])
            if element['name'] == 'Note':
                note_style = re.sub('\n', '', element['css'])
            elif element['name'] == 'Text':
                text_style = re.sub('\n', '', element['css'])
            elif element['name'] == 'Timestamp':
                ts_style = re.sub('\n', '', element['css'])

        # Additional CSS for timestamp color and bg to be formatted
        datetime_style = ("background-color:{0};color:{1};" + ts_style)

        # Order the elements according to stored preferences
        comments_body = ''
        for element in elements:
            if element == 'Text':
                comments_body += '{text}'
            elif element == 'Note':
                comments_body += '{note}'
            elif element == 'Timestamp':
                ts_css = '''<table cellpadding="0" width="100%" style="{ts_style}" color="{color}">
                                <tr>
                                    <td class="location" style="text-align:left">{location}</td>
                                    <td class="timestamp" uts="{unix_timestamp}" style="text-align:right">{friendly_timestamp}</td>
                                </tr>
                            </table>'''
                comments_body += re.sub(r'>\s+<', r'><', ts_css)

        if self.annotations:
            soup = BeautifulSoup(ANNOTATIONS_HEADER)
            dtc = 0

            # Add the annotations
            for i, agroup in enumerate(
                    sorted(self.annotations, key=self._annotation_sorter)):
                location = agroup.location
                if location is None:
                    location = ''

                friendly_timestamp = self._timestamp_to_datestr(
                    agroup.timestamp)

                text = ''
                if agroup.text:
                    for agt in agroup.text:
                        text += '<p class="highlight" style="{0}">{1}</p>'.format(
                            text_style, agt)

                note = ''
                if agroup.note:
                    for agn in agroup.note:
                        note += '<p class="note" style="{0}">{1}</p>'.format(
                            note_style, agn)

                try:
                    dt_bgcolor = COLOR_MAP[agroup.highlightcolor]['bg']
                    dt_fgcolor = COLOR_MAP[agroup.highlightcolor]['fg']
                except:
                    if agroup.highlightcolor is None:
                        msg = "No highlight color specified, using Default"
                    else:
                        msg = "Unknown color '%s' specified" % agroup.highlightcolor
                    self._log_location(msg)
                    dt_bgcolor = COLOR_MAP['Default']['bg']
                    dt_fgcolor = COLOR_MAP['Default']['fg']

                if agroup.hash is not None:
                    # Use existing hash when re-rendering
                    hash = agroup.hash
                else:
                    m = hashlib.md5()
                    m.update(text)
                    m.update(note)
                    hash = m.hexdigest()

                divTag = Tag(BeautifulSoup(), 'div')
                content_args = {
                    'color': agroup.highlightcolor,
                    'friendly_timestamp': friendly_timestamp,
                    'location': location,
                    'note': note,
                    'text': text,
                    'ts_style': datetime_style.format(dt_bgcolor, dt_fgcolor),
                    'unix_timestamp': agroup.timestamp,
                }
                divTag.insert(0, comments_body.format(**content_args))
                divTag['class'] = "annotation"
                divTag['genre'] = ''
                if agroup.genre:
                    divTag['genre'] = escape(agroup.genre)
                divTag['hash'] = hash
                divTag['location_sort'] = agroup.location_sort
                divTag['reader'] = agroup.reader_app
                divTag['style'] = ANNOTATION_DIV_STYLE
                soup.div.insert(dtc, divTag)
                dtc += 1
                if i < len(self.annotations) - 1 and \
                    plugin_prefs.get('appearance_hr_checkbox', False):
                    soup.div.insert(
                        dtc,
                        plugin_prefs.get('HORIZONTAL_RULE',
                                         '<hr width="80%" />'))
                    dtc += 1

        else:
            soup = BeautifulSoup(ANNOTATIONS_HEADER)
        return unicode(soup.renderContents())
Ejemplo n.º 19
0
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p> returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
                for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
        '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(lost_cr.group(),
                                    '%s%s\n\n%s' % (lost_cr.group(1),
                                                    lost_cr.group(2),
                                                    lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '&mdash;')

    soup = BeautifulSoup(comments)
    result = BeautifulSoup()
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if type(token) is NavigableString:
            if not open_pTag:
                pTag = Tag(result,'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc,prepare_string_for_xml(token))
            ptc += 1
        elif type(token) in (CData, Comment, Declaration,
                ProcessingInstruction):
            continue
        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a',
                'hr']:
            if not open_pTag:
                pTag = Tag(result,'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                result.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            result.insert(rtc, token)
            rtc += 1

    if open_pTag:
        result.insert(rtc, pTag)

    for p in result.findAll('p'):
        p['class'] = 'description'

    for t in result.findAll(text=True):
        t.replaceWith(prepare_string_for_xml(unicode(t)))

    return result.renderContents(encoding=None)
Ejemplo n.º 20
0
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p> returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
                for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
        '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(lost_cr.group(),
                                    '%s%s\n\n%s' % (lost_cr.group(1),
                                                    lost_cr.group(2),
                                                    lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '&mdash;')

    soup = BeautifulSoup(comments)
    result = BeautifulSoup()
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if type(token) is NavigableString:
            if not open_pTag:
                pTag = Tag(result,'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc,prepare_string_for_xml(token))
            ptc += 1
        elif type(token) in (CData, Comment, Declaration,
                ProcessingInstruction):
            continue
        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a',
                'hr']:
            if not open_pTag:
                pTag = Tag(result,'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                result.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            result.insert(rtc, token)
            rtc += 1

    if open_pTag:
        result.insert(rtc, pTag)

    for p in result.findAll('p'):
        p['class'] = 'description'

    for t in result.findAll(text=True):
        t.replaceWith(prepare_string_for_xml(unicode(t)))

    return result.renderContents(encoding=None)
Ejemplo n.º 21
0
    def generate_annotation_html(self, bookmark):
        from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
        # Returns <div class="user_annotations"> ... </div>
        last_read_location = bookmark.last_read_location
        timestamp = datetime.datetime.utcfromtimestamp(bookmark.timestamp)
        percent_read = bookmark.percent_read

        ka_soup = BeautifulSoup()
        dtc = 0
        divTag = Tag(ka_soup,'div')
        divTag['class'] = 'user_annotations'

        # Add the last-read location
        spanTag = Tag(ka_soup, 'span')
        spanTag['style'] = 'font-weight:bold'
        if bookmark.book_format == 'pdf':
            spanTag.insert(0,NavigableString(
                _("%(time)s<br />Last Page Read: %(loc)d (%(pr)d%%)") % dict(
                    time=strftime(u'%x', timestamp.timetuple()),
                    loc=last_read_location,
                    pr=percent_read)))
        else:
            spanTag.insert(0,NavigableString(
                _("%(time)s<br />Last Page Read: Location %(loc)d (%(pr)d%%)") % dict(
                    time=strftime(u'%x', timestamp.timetuple()),
                    loc=last_read_location,
                    pr=percent_read)))

        divTag.insert(dtc, spanTag)
        dtc += 1
        divTag.insert(dtc, Tag(ka_soup,'br'))
        dtc += 1

        if bookmark.user_notes:
            user_notes = bookmark.user_notes
            annotations = []

            # Add the annotations sorted by location
            # Italicize highlighted text
            for location in sorted(user_notes):
                if user_notes[location]['text']:
                    annotations.append(
                            _('<b>Location %(dl)d &bull; %(typ)s</b><br />%(text)s<br />') % dict(
                                dl=user_notes[location]['displayed_location'],
                                typ=user_notes[location]['type'],
                                text=(user_notes[location]['text'] if
                                      user_notes[location]['type'] == 'Note' else
                                      '<i>%s</i>' % user_notes[location]['text'])))
                else:
                    if bookmark.book_format == 'pdf':
                        annotations.append(
                                _('<b>Page %(dl)d &bull; %(typ)s</b><br />') % dict(
                                    dl=user_notes[location]['displayed_location'],
                                    typ=user_notes[location]['type']))
                    else:
                        annotations.append(
                                _('<b>Location %(dl)d &bull; %(typ)s</b><br />') % dict(
                                    dl=user_notes[location]['displayed_location'],
                                    typ=user_notes[location]['type']))

            for annotation in annotations:
                divTag.insert(dtc, annotation)
                dtc += 1

        ka_soup.insert(0,divTag)
        return ka_soup
Ejemplo n.º 22
0
    def preprocess_html(self, soup):
        for links in soup.findAll('a'):
            url = links['href']
            if url == '/':
                links.extract()
        for prmattrs in [
                'float: right;margin-left: 5px; margin-bottom: 5px;',
                'doc_tags'
        ]:
            for item in soup.findAll('div', attrs={'style': prmattrs}):
                item.extract()

        mytitle = soup.find('div', attrs={'class': 'doc_title'})
        if mytitle:
            mytitstr = self.tag_to_string(mytitle)
        myauthor = soup.find('div', attrs={'class': 'doc_author'})
        if myauthor:
            myautstr = self.tag_to_string(myauthor)
            myauthor.extract()
            myntitle = myautstr + " - "
            myntitle = myntitle + mytitstr
        else:
            myntitle = mytitle

        tag = Tag(soup, "h2")
        tag['class'] = "headline"
        tag.insert(0, capwords(myntitle))
        mytitle.replaceWith(tag)

        mysubtitle = soup.find('div', attrs={'class': 'doc_subtitle'})
        if mysubtitle:
            mysubtitstr = self.tag_to_string(mysubtitle)
            tag = Tag(soup, "h3")
            tag['class'] = "headline"
            tag.insert(0, capwords(mysubtitstr))
            mysubtitle.replaceWith(tag)

        mylapszam = soup.find('div', attrs={'class': 'lapszam'})
        if mylapszam:
            mylapstr = self.tag_to_string(mylapszam)
            tag = Tag(soup, "h5")
            tag['class'] = "headline"
            tag.insert(0, mylapstr)
            mylapszam.replaceWith(tag)

        return soup
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    old_soup, new_soup: BeautifulSoup()
    Need to strip <hr>, re-sort based on location, build new merged_soup
    with optional interleaved <hr> elements.
    '''
    TRANSIENT_DB = 'transient'

    if False:
        '''
        Older technique: Use hashes to merge annotations
        '''
        #Get the hashes of any existing annotations
        oiuas = old_soup.findAll('div', 'annotation')
        old_hashes = set([ua['hash'] for ua in oiuas])

        # Extract old user_annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(
                parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))

        # Find new annotations
        uas = new_soup.findAll('div', 'annotation')
        new_hashes = set([ua['hash'] for ua in uas])

        updates = list(new_hashes.difference(old_hashes))
        if len(updates) and ouas is not None:
            # Append new to regurgitated
            dtc = len(regurgitated_soup.div)
            for new_annotation_id in updates:
                new_annotation = new_soup.find('div',
                                               {'hash': new_annotation_id})
                regurgitated_soup.div.insert(dtc, new_annotation)
                dtc += 1
            if old_soup:
                merged_soup = unicode(old_soup) + unicode(
                    sort_merged_annotations(regurgitated_soup))
            else:
                merged_soup = unicode(
                    sort_merged_annotations(regurgitated_soup))
        else:
            if old_soup:
                merged_soup = unicode(old_soup) + unicode(new_soup)
            else:
                merged_soup = unicode(new_soup)
        return merged_soup

    else:
        '''
        Newer technique: Use timestamps to merge annotations
        '''
        timestamps = {}
        # Get the timestamps and hashes of the stored annotations
        suas = old_soup.findAll('div', 'annotation')
        for sua in suas:
            #print("sua: %s" % sua.prettify())
            timestamp = sua.find('td', 'timestamp')['uts']
            timestamps[timestamp] = {'stored_hash': sua['hash']}

        # Rerender stored annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate annotations with current CSS
            rerendered_annotations = parent.opts.db.rerender_to_html(
                TRANSIENT_DB, cid)
            regurgitated_soup = BeautifulSoup(rerendered_annotations)

        # Add device annotation timestamps and hashes
        duas = new_soup.findAll('div', 'annotation')
        for dua in duas:
            timestamp = dua.find('td', 'timestamp')['uts']
            if timestamp in timestamps:
                timestamps[timestamp]['device_hash'] = dua['hash']
            else:
                timestamps[timestamp] = {'device_hash': dua['hash']}

        merged_annotations = Tag(BeautifulSoup(), 'div',
                                 [('class', "user_annotations"),
                                  ('style', 'margin:0')])

        for ts in sorted(timestamps):
            if 'stored_hash' in timestamps[
                    ts] and not 'device_hash' in timestamps[ts]:
                # Stored only - add from regurgitated_soup
                annotation = regurgitated_soup.find(
                    'div', {'hash': timestamps[ts]['stored_hash']})

            elif not 'stored_hash' in timestamps[
                    ts] and 'device_hash' in timestamps[ts]:
                # Device only - add from new_soup
                annotation = new_soup.find(
                    'div', {'hash': timestamps[ts]['device_hash']})

            elif timestamps[ts]['stored_hash'] == timestamps[ts][
                    'device_hash']:
                # Stored matches device - add from regurgitated_soup, as user may have modified
                annotation = regurgitated_soup.find(
                    'div', {'hash': timestamps[ts]['stored_hash']})

            elif timestamps[ts]['stored_hash'] != timestamps[ts]['device_hash']:
                # Device has been updated since initial capture - add from new_soup
                annotation = new_soup.find(
                    'div', {'hash': timestamps[ts]['device_hash']})

            else:
                continue

            merged_annotations.append(annotation)

        merged_annotations = sort_merged_annotations(merged_annotations)

        # Update new_soup with merged_annotations
        new_soup_uas = new_soup.find('div', 'user_annotations')
        new_soup_uas.replaceWith(merged_annotations)

        return unicode(new_soup)
Ejemplo n.º 24
0
    def generate_annotation_html(self, bookmark):
        from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString

        # Returns <div class="user_annotations"> ... </div>
        last_read_location = bookmark.last_read_location
        timestamp = datetime.datetime.utcfromtimestamp(bookmark.timestamp)
        percent_read = bookmark.percent_read

        ka_soup = BeautifulSoup()
        dtc = 0
        divTag = Tag(ka_soup, "div")
        divTag["class"] = "user_annotations"

        # Add the last-read location
        spanTag = Tag(ka_soup, "span")
        spanTag["style"] = "font-weight:bold"
        if bookmark.book_format == "pdf":
            spanTag.insert(
                0,
                NavigableString(
                    _("%(time)s<br />Last Page Read: %(loc)d (%(pr)d%%)")
                    % dict(time=strftime(u"%x", timestamp.timetuple()), loc=last_read_location, pr=percent_read)
                ),
            )
        else:
            spanTag.insert(
                0,
                NavigableString(
                    _("%(time)s<br />Last Page Read: Location %(loc)d (%(pr)d%%)")
                    % dict(time=strftime(u"%x", timestamp.timetuple()), loc=last_read_location, pr=percent_read)
                ),
            )

        divTag.insert(dtc, spanTag)
        dtc += 1
        divTag.insert(dtc, Tag(ka_soup, "br"))
        dtc += 1

        if bookmark.user_notes:
            user_notes = bookmark.user_notes
            annotations = []

            # Add the annotations sorted by location
            # Italicize highlighted text
            for location in sorted(user_notes):
                if user_notes[location]["text"]:
                    annotations.append(
                        _("<b>Location %(dl)d &bull; %(typ)s</b><br />%(text)s<br />")
                        % dict(
                            dl=user_notes[location]["displayed_location"],
                            typ=user_notes[location]["type"],
                            text=(
                                user_notes[location]["text"]
                                if user_notes[location]["type"] == "Note"
                                else "<i>%s</i>" % user_notes[location]["text"]
                            ),
                        )
                    )
                else:
                    if bookmark.book_format == "pdf":
                        annotations.append(
                            _("<b>Page %(dl)d &bull; %(typ)s</b><br />")
                            % dict(dl=user_notes[location]["displayed_location"], typ=user_notes[location]["type"])
                        )
                    else:
                        annotations.append(
                            _("<b>Location %(dl)d &bull; %(typ)s</b><br />")
                            % dict(dl=user_notes[location]["displayed_location"], typ=user_notes[location]["type"])
                        )

            for annotation in annotations:
                divTag.insert(dtc, annotation)
                dtc += 1

        ka_soup.insert(0, divTag)
        return ka_soup
    def rebuild_collections(self, booklist, oncard):
        '''
        For each book in the booklist for the card oncard, remove it from all
        its current collections, then add it to the collections specified in
        device_collections.

        oncard is None for the main memory, carda for card A, cardb for card B,
        etc.

        booklist is the object created by the :method:`books` call above.

        This is called after the user edits the 'Collections' field in the Device view
        when Metadata management is set to 'Manual'.

        '''
        self._log_location()

        command_name = "rebuild_collections"
        command_element = "rebuildcollections"
        command_soup = BeautifulStoneSoup(self.parent.COMMAND_XML.format(
            command_element, time.mktime(time.localtime())))

        LOCAL_DEBUG = False
        if booklist:
            changed = 0
            for book in booklist:
                if LOCAL_DEBUG:
                    self._log("{0:7} {1}".format(book.in_library, book.title))

                filename = self.parent.path_template.format(book.uuid)
                if filename not in self.parent.cached_books:
                    for fn in self.parent.cached_books:
                        if book.uuid and book.uuid == self.parent.cached_books[fn]['uuid']:
                            if LOCAL_DEBUG:
                                self._log("'%s' matched on uuid %s" % (book.title, book.uuid))
                            filename = fn
                            break
                        elif (book.title == self.parent.cached_books[fn]['title'] and
                              book.authors == self.parent.cached_books[fn]['authors']):
                            if LOCAL_DEBUG:
                                self._log("'%s' matched on title/author" % book.title)
                            filename = fn
                            break
                    else:
                        self._log("ERROR: file %s not found in cached_books" % repr(filename))
                        continue

                cached_collections = self.parent.cached_books[filename]['device_collections']
                if cached_collections != book.device_collections:
                    # Append the changed book info to the command file
                    book_tag = Tag(command_soup, 'book')
                    book_tag['filename'] = filename
                    book_tag['title'] = book.title
                    book_tag['author'] = ', '.join(book.authors)
                    book_tag['uuid'] = book.uuid

                    collections_tag = Tag(command_soup, 'collections')
                    for tag in book.device_collections:
                        c_tag = Tag(command_soup, 'collection')
                        c_tag.insert(0, tag)
                        collections_tag.insert(0, c_tag)
                    book_tag.insert(0, collections_tag)

                    command_soup.manifest.insert(0, book_tag)

                    # Update cache
                    self.parent.cached_books[filename]['device_collections'] = book.device_collections

                    changed += 1

            if changed:
                # Stage the command file
                self.parent._stage_command_file(command_name, command_soup,
                                                show_command=self.parent.prefs.get('development_mode', False))

                # Wait for completion
                self.parent._wait_for_command_completion(command_name)
            else:
                self._log("no collection changes detected cached_books <=> device books")
Ejemplo n.º 26
0
def new_tag(soup, name, attrs=()):
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)
Ejemplo n.º 27
0
    def postprocess_html(self, soup, first_fetch):
        author_general = soup.find('span', {'class': 'author_general'})
        author_general.em.extract()

        # the complete content
        full_div = None

        transcript_div = soup.find('div', {'id': 'transcript'})
        if transcript_div:  # that's an interview
            # get all <div class="qa" />
            qa_div_list = list(find_by_class(transcript_div, 'div', 'qa'))
            for qa_div in qa_div_list:
                qa_div.extract()

                # replace all <a class="question_link">...</a> with <strong>...</strong>
                question_link = qa_div.find('a', {'class': 'question_link'})
                question_strong = Tag(soup, 'strong')
                question_strong.append(question_link.string)
                question_link.replaceWith(question_strong)

            full_div = find_by_class(soup.find('div', {'id': 'content'}),
                                     'div', 'presentation_full').next()

            # clean the <h1 />
            full_div.h1.span.extract()
            title_div = full_div.h1.div
            title_div.replaceWith(title_div.string)

            # clear the presentation area
            for div in full_div.findAll('div'):
                div.extract()

            # add qa list back to presentation area
            for qa_div in qa_div_list:
                full_div.append(qa_div)
        else:
            # text only without title
            text_div = find_by_class(soup, 'div', 'text_info').next()
            text_div.extract()

            for other in text_div.findAll('div'):
                other.extract()

            # full_div contains title
            full_div = soup.find('div', {'id': 'content'})
            for other in full_div.findAll('div'):
                other.extract()

            full_div.append(text_div)

        full_div.extract()

        nav_div = soup.body.div
        nav_div.extract()

        # keep nav_div and full_div in <body /> only
        for other in soup.body:
            other.extract()

        soup.body.append(nav_div)
        soup.body.append(full_div)

        return soup
Ejemplo n.º 28
0
def comments_to_html(comments):
    """
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p> returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    """
    if not comments:
        return u"<p></p>"
    if not isinstance(comments, unicode):
        comments = comments.decode(preferred_encoding, "replace")

    if comments.lstrip().startswith("<"):
        # Comment is already HTML do not mess with it
        return comments

    if "<" not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>' % x.replace(u"\n", u"<br />") for x in comments.split("\n\n")]
        return "\n".join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback

            traceback.print_exc()
            return u"<p></p>"

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace(".", ".\r"), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(
            lost_cr.group(), "%s%s\n\n%s" % (lost_cr.group(1), lost_cr.group(2), lost_cr.group(3))
        )

    comments = comments.replace(u"\r", u"")
    # Convert \n\n to <p>s
    comments = comments.replace(u"\n\n", u"<p>")
    # Convert solo returns to <br />
    comments = comments.replace(u"\n", "<br />")
    # Convert two hyphens to emdash
    comments = comments.replace("--", "&mdash;")

    soup = BeautifulSoup(comments)
    result = BeautifulSoup()
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if type(token) is NavigableString:
            if not open_pTag:
                pTag = Tag(result, "p")
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, prepare_string_for_xml(token))
            ptc += 1
        elif type(token) in (CData, Comment, Declaration, ProcessingInstruction):
            continue
        elif token.name in ["br", "b", "i", "em", "strong", "span", "font", "a", "hr"]:
            if not open_pTag:
                pTag = Tag(result, "p")
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                result.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            result.insert(rtc, token)
            rtc += 1

    if open_pTag:
        result.insert(rtc, pTag)

    for p in result.findAll("p"):
        p["class"] = "description"

    for t in result.findAll(text=True):
        t.replaceWith(prepare_string_for_xml(unicode(t)))

    return result.renderContents(encoding=None)
Ejemplo n.º 29
0
    def to_HTML(self, header=''):
        '''
        Generate HTML with user-specified CSS, element order
        '''
        # Retrieve CSS prefs
        from calibre_plugins.annotations.appearance import default_elements
        stored_css = plugin_prefs.get('appearance_css', default_elements)

        elements = []
        for element in stored_css:
            elements.append(element['name'])
            if element['name'] == 'Note':
                note_style = re.sub('\n', '', element['css'])
            elif element['name'] == 'Text':
                text_style = re.sub('\n', '', element['css'])
            elif element['name'] == 'Timestamp':
                ts_style = re.sub('\n', '', element['css'])

        # Additional CSS for timestamp color and bg to be formatted
        datetime_style = ("background-color:{0};color:{1};" + ts_style)

        # Order the elements according to stored preferences
        comments_body = ''
        for element in elements:
            if element == 'Text':
                comments_body += '{text}'
            elif element == 'Note':
                comments_body += '{note}'
            elif element == 'Timestamp':
                ts_css = '''<table cellpadding="0" width="100%" style="{ts_style}" color="{color}">
                                <tr>
                                    <td class="location" style="text-align:left">{location}</td>
                                    <td class="timestamp" uts="{unix_timestamp}" style="text-align:right">{friendly_timestamp}</td>
                                </tr>
                            </table>'''
                comments_body += re.sub(r'>\s+<', r'><', ts_css)

        if self.annotations:
            soup = BeautifulSoup(ANNOTATIONS_HEADER)
            dtc = 0

            # Add the annotations
            for i, agroup in enumerate(sorted(self.annotations, key=self._annotation_sorter)):
                location = agroup.location
                if location is None:
                    location = ''

                friendly_timestamp = self._timestamp_to_datestr(agroup.timestamp)

                text = ''
                if agroup.text:
                    for agt in agroup.text:
                        text += '<p class="highlight" style="{0}">{1}</p>'.format(text_style, agt)

                note = ''
                if agroup.note:
                    for agn in agroup.note:
                        note += '<p class="note" style="{0}">{1}</p>'.format(note_style, agn)

                try:
                    dt_bgcolor = COLOR_MAP[agroup.highlightcolor]['bg']
                    dt_fgcolor = COLOR_MAP[agroup.highlightcolor]['fg']
                except:
                    if agroup.highlightcolor is None:
                        msg = "No highlight color specified, using Default"
                    else:
                        msg = "Unknown color '%s' specified" % agroup.highlightcolor
                    self._log_location(msg)
                    dt_bgcolor = COLOR_MAP['Default']['bg']
                    dt_fgcolor = COLOR_MAP['Default']['fg']

                if agroup.hash is not None:
                    # Use existing hash when re-rendering
                    hash = agroup.hash
                else:
                    m = hashlib.md5()
                    m.update(text)
                    m.update(note)
                    hash = m.hexdigest()

                divTag = Tag(BeautifulSoup(), 'div')
                content_args = {
                            'color': agroup.highlightcolor,
                            'friendly_timestamp': friendly_timestamp,
                            'location': location,
                            'note': note,
                            'text': text,
                            'ts_style': datetime_style.format(dt_bgcolor, dt_fgcolor),
                            'unix_timestamp': agroup.timestamp,
                            }
                divTag.insert(0, comments_body.format(**content_args))
                divTag['class'] = "annotation"
                divTag['genre'] = ''
                if agroup.genre:
                    divTag['genre'] = escape(agroup.genre)
                divTag['hash'] = hash
                divTag['location_sort'] = agroup.location_sort
                divTag['reader'] = agroup.reader_app
                divTag['style'] = ANNOTATION_DIV_STYLE
                soup.div.insert(dtc, divTag)
                dtc += 1
                if i < len(self.annotations) - 1 and \
                    plugin_prefs.get('appearance_hr_checkbox', False):
                    soup.div.insert(dtc, plugin_prefs.get('HORIZONTAL_RULE', '<hr width="80%" />'))
                    dtc += 1

        else:
            soup = BeautifulSoup(ANNOTATIONS_HEADER)
        return unicode(soup.renderContents())
Ejemplo n.º 30
0
    def generate_annotation_html(self, bookmark):
        from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
        # Returns <div class="user_annotations"> ... </div>
        last_read_location = bookmark.last_read_location
        timestamp = datetime.datetime.utcfromtimestamp(bookmark.timestamp)
        percent_read = bookmark.percent_read

        ka_soup = BeautifulSoup()
        dtc = 0
        divTag = Tag(ka_soup, 'div')
        divTag['class'] = 'user_annotations'

        # Add the last-read location
        spanTag = Tag(ka_soup, 'span')
        spanTag['style'] = 'font-weight:bold'
        if bookmark.book_format == 'pdf':
            spanTag.insert(0,NavigableString(
                _("%(time)s<br />Last Page Read: %(loc)d (%(pr)d%%)") % \
                            dict(time=strftime(u'%x', timestamp.timetuple()),
                            loc=last_read_location,
                            pr=percent_read)))
        else:
            spanTag.insert(0,NavigableString(
                _("%(time)s<br />Last Page Read: Location %(loc)d (%(pr)d%%)") % \
                            dict(time=strftime(u'%x', timestamp.timetuple()),
                            loc=last_read_location,
                            pr=percent_read)))

        divTag.insert(dtc, spanTag)
        dtc += 1
        divTag.insert(dtc, Tag(ka_soup, 'br'))
        dtc += 1

        if bookmark.user_notes:
            user_notes = bookmark.user_notes
            annotations = []

            # Add the annotations sorted by location
            # Italicize highlighted text
            for location in sorted(user_notes):
                if user_notes[location]['text']:
                    annotations.append(
                            _('<b>Location %(dl)d &bull; %(typ)s</b><br />%(text)s<br />') % \
                                        dict(dl=user_notes[location]['displayed_location'],
                                            typ=user_notes[location]['type'],
                                            text=(user_notes[location]['text'] if \
                                            user_notes[location]['type'] == 'Note' else \
                                            '<i>%s</i>' % user_notes[location]['text'])))
                else:
                    if bookmark.book_format == 'pdf':
                        annotations.append(
                                _('<b>Page %(dl)d &bull; %(typ)s</b><br />') % \
                                    dict(dl=user_notes[location]['displayed_location'],
                                        typ=user_notes[location]['type']))
                    else:
                        annotations.append(
                                _('<b>Location %(dl)d &bull; %(typ)s</b><br />') % \
                                    dict(dl=user_notes[location]['displayed_location'],
                                        typ=user_notes[location]['type']))

            for annotation in annotations:
                divTag.insert(dtc, annotation)
                dtc += 1

        ka_soup.insert(0, divTag)
        return ka_soup
Ejemplo n.º 31
0
    def postprocess_html(self,soup, True):
        try:
                if self.one_picture_per_article:
                        # Remove all images after first
                        largeImg = soup.find(True, {'class':'articleSpanImage'})
                        inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
                        if largeImg:
                                for inlineImg in inlineImgs:
                                        inlineImg.extract()
                        else:
                                if inlineImgs:
                                        firstImg = inlineImgs[0]
                                        for inlineImg in inlineImgs[1:]:
                                                inlineImg.extract()
                                        # Move firstImg before article body
                                        cgFirst = soup.find(True, {'class':re.compile('columnGroup  *first')})
                                        if cgFirst:
                                                # Strip all sibling NavigableStrings: noise
                                                navstrings = cgFirst.findAll(text=True, recursive=False)
                                                [ns.extract() for ns in navstrings]
                                                headline_found = False
                                                tag = cgFirst.find(True)
                                                insertLoc = 0
                                                while True:
                                                        insertLoc += 1
                                                        if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
                                                                        headline_found = True
                                                                        break
                                                        tag = tag.nextSibling
                                                        if not tag:
                                                                headline_found = False
                                                                break
                                                if headline_found:
                                                        cgFirst.insert(insertLoc,firstImg)
                                        else:
                                                self.log(">>> No class:'columnGroup first' found <<<")
        except:
                self.log("ERROR: One picture per article in postprocess_html")

        try:
                # Change captions to italic
                for caption in soup.findAll(True, {'class':'caption'}) :
                        if caption and len(caption) > 0:
                                cTag = Tag(soup, "p", [("class", "caption")])
                                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                                mp_off = c.find("More Photos")
                                if mp_off >= 0:
                                        c = c[:mp_off]
                                cTag.insert(0, c)
                                caption.replaceWith(cTag)
        except:
                self.log("ERROR:  Problem in change captions to italic")

        try:
                # Change <nyt_headline> to <h2>
                h1 = soup.find('h1')
                blogheadline = str(h1) #added for dealbook
                if h1:
                        headline = h1.find("nyt_headline")
                        if headline:
                                tag = Tag(soup, "h2")
                                tag['class'] = "headline"
                                tag.insert(0, self.fixChars(headline.contents[0]))
                                h1.replaceWith(tag)
                        elif blogheadline.find('entry-title'):#added for dealbook
                                tag = Tag(soup, "h2")#added for dealbook
                                tag['class'] = "headline"#added for dealbook
                                tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook
                                h1.replaceWith(tag)#added for dealbook

                else:
                        # Blog entry - replace headline, remove <hr> tags  - BCC I think this is no longer functional 1-18-2011
                        headline = soup.find('title')
                        if headline:
                                tag = Tag(soup, "h2")
                                tag['class'] = "headline"
                                tag.insert(0, self.fixChars(headline.renderContents()))
                                soup.insert(0, tag)
                                hrs = soup.findAll('hr')
                                for hr in hrs:
                                        hr.extract()
        except:
                self.log("ERROR:  Problem in Change <nyt_headline> to <h2>")

        try:
                #if this is from a blog (dealbook, fix the byline format
                bylineauthor = soup.find('address',attrs={'class':'byline author vcard'})
                if bylineauthor:
                    tag = Tag(soup, "h6")
                    tag['class'] = "byline"
                    tag.insert(0, self.fixChars(bylineauthor.renderContents()))
                    bylineauthor.replaceWith(tag)
        except:
            self.log("ERROR:  fixing byline author format")

        try:
                #if this is a blog (dealbook) fix the credit style for the pictures
                blogcredit = soup.find('div',attrs={'class':'credit'})
                if blogcredit:
                    tag = Tag(soup, "h6")
                    tag['class'] = "credit"
                    tag.insert(0, self.fixChars(blogcredit.renderContents()))
                    blogcredit.replaceWith(tag)
        except:
            self.log("ERROR:  fixing credit format")


        try:
                # Change <h1> to <h3> - used in editorial blogs
                masthead = soup.find("h1")
                if masthead:
                        # Nuke the href
                        if masthead.a:
                                del(masthead.a['href'])
                        tag = Tag(soup, "h3")
                        tag.insert(0, self.fixChars(masthead.contents[0]))
                        masthead.replaceWith(tag)
        except:
                self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")

        try:
                # Change <span class="bold"> to <b>
                for subhead in soup.findAll(True, {'class':'bold'}) :
                        if subhead.contents:
                                bTag = Tag(soup, "b")
                                bTag.insert(0, subhead.contents[0])
                                subhead.replaceWith(bTag)
        except:
                self.log("ERROR:  Problem in Change <h1> to <h3> - used in editorial blogs")
        try:
                #remove the <strong> update tag
                blogupdated = soup.find('span', {'class':'update'})
                if blogupdated:
                    blogupdated.replaceWith("")
        except:
                self.log("ERROR:  Removing strong tag")

        try:
                divTag = soup.find('div',attrs={'id':'articleBody'})
                if divTag:
                        divTag['class'] = divTag['id']
        except:
                self.log("ERROR:  Problem in soup.find(div,attrs={id:articleBody})")

        try:
                # Add class="authorId" to <div> so we can format with CSS
                divTag = soup.find('div',attrs={'id':'authorId'})
                if divTag and divTag.contents[0]:
                        tag = Tag(soup, "p")
                        tag['class'] = "authorId"
                        tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                                                         use_alt=False)))
                        divTag.replaceWith(tag)
        except:
                self.log("ERROR:  Problem in Add class=authorId to <div> so we can format with CSS")

        return soup