def search_for_asin_on_amazon(self, query):
        '''Search for book's asin on amazon using given query'''
        query = urlencode({'keywords': query})
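        # query has the form 'keywords=<encoded terms>'; query[9:] strips the
        # 'keywords=' prefix so the encoded terms can be reused inside the
        # rh= search filter below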
        url = '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query
        try:
            response = open_url(self._connections['amazon'], url)
        except PageDoesNotExist:
            return None

        # check to make sure there are results
        if ('did not match any products' in response and 'Did you mean:' not in response and
                'so we searched in All Departments' not in response):
            return None

        soup = BeautifulSoup(response)
        results = soup.findAll('div', {'id': 'resultsCol'})

        if not results:
            return None

        for result in results:
            if 'Buy now with 1-Click' in str(result):
                asin_search = AMAZON_ASIN_PAT.search(str(result))
                if asin_search:
                    return asin_search.group(1)

        return None
def download_html(target_url):
    soup = ""
    html_page = ""
    html_raw = ""
    try:
        #~ connection = urllib2.urlopen(target_url)
        # myurlopen: presumably a urllib2.urlopen-style wrapper (see the commented call above)
        connection = myurlopen(target_url)
        html_raw = connection.read()
        soup = BeautifulSoup(html_raw)
        if hasattr(soup, "findAll"):
            if DEBUG:
                print("BeautifulSoup3 is being used in download_html")
        elif hasattr(soup, "find_all"):
            if DEBUG:
                print("BeautifulSoup4 is being used in download_html")
        else:
            if DEBUG:
                print("BeautifulSoup???? is being used in download_html")
        connection.close()
        del connection
        html_page = soup.prettify()  # take note...
    except Exception as e:
        if DEBUG:
            print(unicode_type(e))

    # fall back to empty values if anything above failed
    if not soup:
        if DEBUG:
            print("not soup")
        soup = ""
    if not html_page:
        html_page = ""
    if not html_raw:
        html_raw = ""

    return html_page, soup, html_raw
Example 3
    def get_annotations_date_range(self):
        '''
        Find oldest, newest annotation in annotated books
        initial values of self.oldest, self.newest are reversed to allow update comparisons
        if no annotations, restore to correct values
        '''
        annotations_found = False

        for cid in self.annotation_map:
            mi = self.cdb.get_metadata(cid, index_is_id=True)
            if self.field == 'Comments':
                soup = BeautifulSoup(mi.comments)
            else:
                soup = BeautifulSoup(
                    mi.get_user_metadata(self.field, False)['#value#'])

            uas = soup.findAll('div', 'annotation')
            for ua in uas:
                annotations_found = True
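                # the td's 'uts' attribute holds the annotation timestamp used for the oldest/newest comparison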
                timestamp = float(ua.find('td', 'timestamp')['uts'])
                if timestamp < self.oldest_annotation:
                    self.oldest_annotation = timestamp
                if timestamp > self.newest_annotation:
                    self.newest_annotation = timestamp

        if not annotations_found:
            temp = self.newest_annotation
            self.newest_annotation = self.oldest_annotation
            self.oldest_annotation = temp
Example 4
def existing_annotations(parent, field, return_all=False):
    '''
    Return count of existing annotations, or existence of any
    '''
    import calibre_plugins.annotations.config as cfg
    annotation_map = []
    if field:
        db = parent.opts.gui.current_db
        id = db.FIELD_MAP['id']
        for i, record in enumerate(db.data.iterall()):
            mi = db.get_metadata(record[id], index_is_id=True)
            if field == 'Comments':
                if mi.comments:
                    soup = BeautifulSoup(mi.comments)
                else:
                    continue
            else:
                soup = BeautifulSoup(mi.get_user_metadata(field, False)['#value#'])
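            # annotated books are marked by a <div class="user_annotations"> block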
            if soup.find('div', 'user_annotations') is not None:
                annotation_map.append(mi.id)
                if not return_all:
                    break
        if return_all:
            _log_location("Identified %d annotated books of %d total books" %
                (len(annotation_map), len(db.data)))
    return annotation_map
Example 5
def extract_calibre_cover(raw, base, log):
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup(raw)
    matches = soup.find(name=["h1", "h2", "h3", "h4", "h5", "h6", "p", "span", "font", "br"])
    images = soup.findAll("img")
    if matches is None and len(images) == 1 and images[0].get("alt", "") == "cover":
        img = images[0]
        img = os.path.join(base, *img["src"].split("/"))
        if os.path.exists(img):
            return open(img, "rb").read()

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find("body")
        if body is not None:
            text = u"".join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll("img", src=True)
            if 0 < len(images) < 2:
                img = os.path.join(base, *images[0]["src"].split("/"))
                if os.path.exists(img):
                    return open(img, "rb").read()
Example 7
def save_soup(soup, target):
    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    nm = ns.find('meta')
    metas = soup.findAll('meta', content=True)
    added = False
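    # swap any charset-bearing <meta> for the UTF-8 declaration built above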
    for meta in metas:
        if 'charset' in meta.get('content', '').lower():
            meta.replaceWith(nm)
            added = True
    if not added:
        head = soup.find('head')
        if head is not None:
            head.insert(0, nm)

    selfdir = os.path.dirname(target)

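    # rewrite absolute local resource paths so they are relative to the output file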
    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))
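
A minimal usage sketch for the save_soup above (file names are hypothetical; the import matches the one used in Example 5):

from calibre.ebooks.BeautifulSoup import BeautifulSoup
raw = open('page.html', 'rb').read()
save_soup(BeautifulSoup(raw), 'out/page.html')  # forces a UTF-8 charset <meta> and relativizes local paths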
Example 8
def existing_annotations(parent, field, return_all=False):
    """
    Return count of existing annotations, or existence of any
    """
    # import calibre_plugins.marvin_manager.config as cfg
    _log_location(field)
    annotation_map = []
    if field:
        db = parent.opts.gui.current_db
        id = db.FIELD_MAP["id"]
        for i, record in enumerate(db.data.iterall()):
            mi = db.get_metadata(record[id], index_is_id=True)
            if field == "Comments":
                if mi.comments:
                    soup = BeautifulSoup(mi.comments)
                else:
                    continue
            else:
                soup = BeautifulSoup(mi.get_user_metadata(field, False)["#value#"])
            if soup.find("div", "user_annotations") is not None:
                annotation_map.append(mi.id)
                if not return_all:
                    break
        if return_all:
            _log("Identified %d annotated books of %d total books" % (len(annotation_map), len(db.data)))

        _log("annotation_map: %s" % repr(annotation_map))
    else:
        _log("no active field")

    return annotation_map
Example 10
    def find_all_annotated_books(self):
        '''
        Find all annotated books in library
        '''
        if not self.field:
            self._log_location()
            self._log("No custom column field specified, cannot find annotated books")
            return
        if not (self.field in self.cdb.custom_field_keys() or self.field == 'Comments'):
            self._log_location()
            self._log("Field '%s' is not a custom column, cannot find annotated books" % self.field)
            return

        id = self.cdb.FIELD_MAP['id']
        for record in self.cdb.data.iterall():
            mi = self.cdb.get_metadata(record[id], index_is_id=True)
            if self.field == 'Comments':
                if mi.comments:
                    soup = BeautifulSoup(mi.comments)
                else:
                    continue
            else:
                soup = BeautifulSoup(mi.get_user_metadata(self.field, False)['#value#'])

            if soup.find('div', 'user_annotations') is not None:
                self.annotation_map.append(mi.id)
Example 14
    def search_for_asin_on_amazon(self, query):
        '''Search for book's asin on amazon using given query'''
        query = urlencode({'keywords': query})
        url = '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query
        try:
            response = open_url(self._connections['amazon'], url)
        except PageDoesNotExist:
            return None

        # check to make sure there are results
        if ('did not match any products' in response
                and 'Did you mean:' not in response
                and 'so we searched in All Departments' not in response):
            return None

        soup = BeautifulSoup(response)
        results = soup.findAll('div', {'class': 's-result-list'})

        if not results:
            return None

        for result in results:
            if 'Buy now with 1-Click' in str(result):
                asin_search = AMAZON_ASIN_PAT.search(str(result))
                if asin_search:
                    return asin_search.group(1)

        return None
Example 15
def get_series(title, authors, timeout=60):
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode):
        title = title.encode('utf-8')

    title = urllib.quote_plus(title)

    author = authors[0].strip()
    if not author:
        return mi
    if ',' in author:
        author = author.split(',')[0]
    else:
        author = author.split()[-1]

    url = URL.format(author, title)
    br = browser()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi
    raw = xml_to_unicode(raw)[0]
    soup = BeautifulSoup(raw)
    searcharea = soup.find('div', attrs={'class': 'searcharea'})
    if searcharea is None:
        return mi
    ss = searcharea.find('div', attrs={'class': 'seriessearch'})
    if ss is None:
        return mi
    a = ss.find('a', href=True)
    if a is None:
        return mi
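    # keep only the query string of the series link; it carries the SeriesName parameter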
    href = a['href'].partition('?')[-1]
    data = urlparse.parse_qs(href)
    series = data.get('SeriesName', [])
    if not series:
        return mi
    series = series[0]
    series = re.sub(r' series$', '', series).strip()
    if series:
        mi.series = series
    ns = ss.nextSibling
    if ns.contents:
        raw = unicode(ns.contents[0])
        raw = raw.partition('.')[0].strip()
        try:
            mi.series_index = int(raw)
        except:
            pass
    return mi
Example 17
    def get_soup(self, src, url=None):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        # Some websites have buggy doctype declarations that mess up beautifulsoup
        nmassage += [(re.compile(r'<!DOCTYPE .+?>',
                                 re.DOTALL | re.IGNORECASE), lambda m: '')]
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        soup = BeautifulSoup(usrc, markupMassage=nmassage)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            soup = BeautifulSoup(xml_to_unicode(replace,
                                                self.verbose,
                                                strip_encoding_pats=True)[0],
                                 markupMassage=nmassage)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
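            # extract every sibling of tag in the given direction ('nextSibling' or
            # 'previousSibling'), then repeat for each ancestor up to <body>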
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(
                self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(
                self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Example 18
 def find_all_annotated_books(self):
     '''
     Find all annotated books in library
     '''
     self._log_location("field: {0}".format(self.field))
     cids = self.cdb.search_getting_ids('formats:EPUB', '')
     for cid in cids:
         mi = self.cdb.get_metadata(cid, index_is_id=True)
         raw = mi.get_user_metadata(self.field, False)
         if raw['#value#'] is not None:
             soup = BeautifulSoup(raw['#value#'])
             if soup.find('div', 'user_annotations') is not None:
                 self.annotation_map.append(mi.id)
Example 20
    def get_soup(self, src, url=None):
        nmassage = []
        nmassage.extend(self.preprocess_regexps)
        # Remove comments as they can leave detritus when extracting tags leaves
        # multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        for pat, repl in nmassage:
            usrc = pat.sub(repl, usrc)
        soup = BeautifulSoup(usrc)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
            for pat, repl in nmassage:
                replace = pat.sub(repl, replace)
            soup = BeautifulSoup(replace)

        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
Example 21
    def get_asin(self, connection):
        query = urlencode({'keywords': '%s - %s' % (self._title, self._author)})
        try:
            connection.request('GET', '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query, headers=self.HEADERS)
            response = connection.getresponse().read()
        except:
            try:
                connection.close()
                if self._proxy:
                    connection = HTTPConnection(self._http_address, self._http_port)
                    connection.set_tunnel('www.amazon.com', 80)
                else:
                    connection = HTTPConnection('www.amazon.com')

                connection.request('GET', '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query, headers=self.HEADERS)
                response = connection.getresponse().read()
            except:
                self._status = self.FAIL
                self._status_message = self.FAILED_COULD_NOT_CONNECT_TO_AMAZON
                raise Exception(self._status_message)

        # check to make sure there are results
        if ('did not match any products' in response and
                'Did you mean:' not in response and
                'so we searched in All Departments' not in response):
            self._status = self.FAIL
            self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_PAGE
            raise Exception(self._status_message)

        soup = BeautifulSoup(response)
        results = soup.findAll('div', {'id': 'resultsCol'})

        if not results:
            self._status = self.FAIL
            self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_PAGE
            raise Exception(self._status_message)

        for r in results:
            if 'Buy now with 1-Click' in str(r):
                asinSearch = self.AMAZON_ASIN_PAT.search(str(r))
                if asinSearch:
                    self._asin = asinSearch.group(1)
                    mi = self._db.get_metadata(self._book_id)
                    identifiers = mi.get_identifiers()
                    identifiers['mobi-asin'] = self._asin
                    mi.set_identifiers(identifiers)
                    self._db.set_metadata(self._book_id, mi)
                    self._book_settings.prefs['asin'] = self._asin
                    return connection

        self._status = self.FAIL
        self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_ASIN
        raise Exception(self._status_message)
Example 22
    def generate_html(comments):
        args = dict(xmlns=XHTML_NS,
                    title_str=title_str,
                    css=css,
                    title=title,
                    author=author,
                    publisher=publisher,
                    pubdate_label=_('Published'),
                    pubdate=pubdate,
                    series_label=_('Series'),
                    series=series,
                    rating_label=_('Rating'),
                    rating=rating,
                    tags_label=_('Tags'),
                    tags=tags,
                    comments=comments,
                    footer='')
        for key in mi.custom_field_keys():
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                key = key.replace('#', '_')
                args[key] = escape(val)
                args[key + '_label'] = escape(display_name)
            except:
                pass

        # Used in the comment describing use of custom columns in templates
        args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
        args['_genre'] = args.get('_genre', '{_genre}')

        generated_html = P('jacket/template.xhtml',
                           data=True).decode('utf-8').format(**args)

        # Post-process the generated html to strip out empty header items

        soup = BeautifulSoup(generated_html)
        if not series:
            series_tag = soup.find(attrs={'class': 'cbj_series'})
            if series_tag is not None:
                series_tag.extract()
        if not rating:
            rating_tag = soup.find(attrs={'class': 'cbj_rating'})
            if rating_tag is not None:
                rating_tag.extract()
        if not tags:
            tags_tag = soup.find(attrs={'class': 'cbj_tags'})
            if tags_tag is not None:
                tags_tag.extract()
        if not pubdate:
            pubdate_tag = soup.find(attrs={'class': 'cbj_pubdata'})
            if pubdate_tag is not None:
                pubdate_tag.extract()
        if output_profile.short_name != 'kindle':
            hr_tag = soup.find('hr', attrs={'class': 'cbj_kindle_banner_hr'})
            if hr_tag is not None:
                hr_tag.extract()

        return strip_encoding_declarations(
            soup.renderContents('utf-8').decode('utf-8'))
Example 23
def get_metadata_from_reader(rdr):
    raw = rdr.GetFile(rdr.home)
    home = BeautifulSoup(
        xml_to_unicode(raw, strip_encoding_pats=True,
                       resolve_entities=True)[0])

    title = rdr.title
    try:
        x = rdr.GetEncoding()
        codecs.lookup(x)
        enc = x
    except:
        enc = 'cp1252'
    title = force_unicode(title, enc)
    authors = _get_authors(home)
    mi = MetaInformation(title, authors)
    publisher = _get_publisher(home)
    if publisher:
        mi.publisher = publisher
    isbn = _get_isbn(home)
    if isbn:
        mi.isbn = isbn
    comments = _get_comments(home)
    if comments:
        mi.comments = comments

    cdata = _get_cover(home, rdr)
    if cdata is not None:
        mi.cover_data = ('jpg', cdata)

    return mi
Example 24
 def construct(self, bookmark_notes):
     '''
     bookmark_notes: {loc_sort: {color, location, note}…}
     Optionally include <hr> between booknotes
     '''
     soup = None
     if bookmark_notes:
         soup = BeautifulSoup(
             '''<div class="{0}"></div>'''.format('bookmark_notes'))
         dtc = 0
         for i, location_sort in enumerate(sorted(bookmark_notes.keys())):
             soup.div.insert(
                 dtc,
                 self.BOOKMARK_TEMPLATE.format(
                     location_sort, bookmark_notes[location_sort]['color'],
                     self._get_style('Location'),
                     bookmark_notes[location_sort]['location'],
                     self._get_style('Note'),
                     bookmark_notes[location_sort]['note']))
             dtc += 1
             if (i < len(bookmark_notes) - 1
                     and plugin_prefs.get('appearance_hr_checkbox', False)):
                 soup.div.insert(
                     dtc,
                     plugin_prefs.get('HORIZONTAL_RULE',
                                      '<hr width="80%" />'))
                 dtc += 1
     return soup
Example 25
 def _remove_old_style(self, html):
     '''
     Remove the old style tag, finalize soup in preparation for styling
     '''
     unstyled_soup = BeautifulSoup(html)
     head = unstyled_soup.find("head")
     voc = unstyled_soup.body.find('div', {'class': 'vocabulary'})
     tds = voc.findAll(lambda tag: tag.name == 'td' and tag.a)
     dart = random.randrange(len(tds))
     self.td = tds[dart]
     self.oh = self.td.a['href']
     self.td.a['href'] = self._finalize()
     old_style = head.find('style')
     if old_style:
         old_style.extract()
     return unstyled_soup
Example 27
 def _inject_css(self, html):
     '''
     stick a <style> element into html
     '''
     css = self.prefs.get('injected_css', None)
     if css:
         try:
             styled_soup = BeautifulSoup(html)
             head = styled_soup.find("head")
             style_tag = Tag(styled_soup, 'style')
             style_tag['type'] = "text/css"
             style_tag.insert(0, css)
             head.insert(0, style_tag)
             html = styled_soup.renderContents()
         except:
             return html
     return html
Example 30
def merge_annotations_with_comments(parent, cid, comments_soup, new_soup):
    '''
    comments_soup: comments potentially with user_annotations
    '''

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        plugin_prefs.get(
            'COMMENTS_DIVIDER',
            '&middot;  &middot;  &bull;  &middot;  &#x2726;  &middot;  &bull;  &middot; &middot;'
        ))

    # Remove the old comments_divider
    cds = comments_soup.find('div', 'comments_divider')
    if cds:
        cds.extract()

    # Existing annotations?
    uas = comments_soup.find('div', 'user_annotations')
    if uas:
        # Save the existing annotations to old_soup
        old_soup = BeautifulSoup(unicode(uas))

        # Remove any hrs from old_soup
        hrs = old_soup.findAll('hr')
        if hrs:
            for hr in hrs:
                hr.extract()

        # Remove the existing annotations from comments_soup
        uas.extract()

        # Merge old_soup with new_soup
        merged_soup = unicode(comments_soup) + \
                      unicode(comments_divider) + \
                      unicode(merge_annotations(parent, cid, old_soup, new_soup))
    else:
        # No existing, just merge comments_soup with already sorted new_soup
        merged_soup = unicode(comments_soup) + \
                      unicode(comments_divider) + \
                      unicode(new_soup)

    return merged_soup
Example 32
def sort_merged_annotations(merged_soup):
    '''
    Input: a combined group of user annotations
    Output: sorted by location
    '''
    include_hr = plugin_prefs.get('appearance_hr_checkbox', False)
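    # each annotation div carries a location_sort attribute, used here as the sort key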
    locations = merged_soup.findAll(location_sort=True)
    locs = [loc['location_sort'] for loc in locations]
    locs.sort()

    sorted_soup = BeautifulSoup(ANNOTATIONS_HEADER)
    dtc = 0
    for i, loc in enumerate(locs):
        next_div = merged_soup.find(attrs={'location_sort': loc})
        sorted_soup.div.insert(dtc, next_div)
        dtc += 1
        if include_hr and i < len(locs) - 1:
            sorted_soup.div.insert(dtc, BeautifulSoup(plugin_prefs.get('HORIZONTAL_RULE', '<hr width="80%" />')))
            dtc += 1

    return sorted_soup
Example 34
    def preview_css(self):
        '''
        Construct a dummy set of notes and annotation for preview purposes
        Modeled after book_status:_get_formatted_annotations()
        '''
        from calibre_plugins.marvin_manager.annotations import (
            ANNOTATIONS_HTML_TEMPLATE, Annotation, Annotations, BookNotes, BookmarkNotes)

        # Assemble the preview soup
        soup = BeautifulSoup(ANNOTATIONS_HTML_TEMPLATE)

        # Load the CSS from MXD resources
        path = os.path.join(self.parent.opts.resources_path, 'css', 'annotations.css')
        with open(path, 'rb') as f:
            css = f.read().decode('utf-8')
        style_tag = Tag(soup, 'style')
        style_tag.insert(0, css)
        soup.head.style.replaceWith(style_tag)

        # Assemble the sample Book notes
        book_notes_soup = BookNotes().construct(self.sample_book_notes)
        soup.body.append(book_notes_soup)
        cd_tag = Tag(soup, 'div', [('class', "divider")])
        soup.body.append(cd_tag)

        # Assemble the sample Bookmark notes
        bookmark_notes_soup = BookmarkNotes().construct(self.sample_bookmark_notes)
        soup.body.append(bookmark_notes_soup)
        cd_tag = Tag(soup, 'div', [('class', "divider")])
        soup.body.append(cd_tag)

        # Assemble the sample annotations
        pas = Annotations(None, title="Preview")
        pas.annotations.append(Annotation(self.sample_ann_1))
        pas.annotations.append(Annotation(self.sample_ann_2))
        pas.annotations.append(Annotation(self.sample_ann_3))
        annotations_soup = pas.to_HTML(pas.create_soup())
        soup.body.append(annotations_soup)

        self.parent.wv.setHtml(unicode(soup.renderContents()))
Example 35
    def get_asin(self):
        query = urlencode({'keywords': '%s' % self.title_and_author})
        try:
            self._aConnection.request('GET', '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query, headers=self.HEADERS)
            response = self._aConnection.getresponse().read()
        except:
            try:
                self._aConnection.close()
                if self._proxy:
                    self._aConnection = HTTPConnection(self._http_address, self._http_port)
                    self._aConnection.set_tunnel('www.amazon.com', 80)
                else:
                    self._aConnection = HTTPConnection('www.amazon.com')

                self._aConnection.request('GET', '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query, headers=self.HEADERS)
                response = self._aConnection.getresponse().read()
            except:
                return None

        # check to make sure there are results
        if ('did not match any products' in response and
                'Did you mean:' not in response and
                'so we searched in All Departments' not in response):
            return None

        soup = BeautifulSoup(response)
        soup = BeautifulSoup(response)
        results = soup.findAll('div', {'id': 'resultsCol'})

        if not results:
            return None

        for r in results:
            if 'Buy now with 1-Click' in str(r):
                asinSearch = self.AMAZON_ASIN_PAT.search(str(r))
                if asinSearch:
                    asin = asinSearch.group(1)
                    mi = self._db.get_metadata(self._book_id)
                    identifiers = mi.get_identifiers()
                    identifiers['mobi-asin'] = asin
                    mi.set_identifiers(identifiers)
                    self._db.set_metadata(self._book_id, mi)
                    return asin
Example 37
    def read_html_toc(self, toc):
        self.base_path = os.path.dirname(toc)
        soup = BeautifulSoup(open(toc, "rb").read(), convertEntities=BeautifulSoup.HTML_ENTITIES)
        for a in soup.findAll("a"):
            if not a.has_key("href"):
                continue
            purl = urlparse(unquote(a["href"]))
            href, fragment = purl[2], purl[5]
            if not fragment:
                fragment = None
            else:
                fragment = fragment.strip()
            href = href.strip()

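            # concatenate the anchor's text nodes to form the TOC entry label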
            txt = "".join([unicode(s).strip() for s in a.findAll(text=True)])
            add = True
            for i in self.flat():
                if i.href == href and i.fragment == fragment:
                    add = False
                    break
            if add:
                self.add_item(href, fragment, txt)
Example 39
 def construct(self, book_notes):
     '''
     Given a list of notes, render HTML
     '''
     soup = None
     if book_notes:
         soup = BeautifulSoup(
             '''<div class="{0}"></div>'''.format('book_notes'))
         for note in book_notes:
             div_tag = Tag(soup, 'div', [('class', "book_note")])
             p_tag = Tag(soup, 'p',
                         [('class', "book_note"),
                          ('style', "{0}".format(self._get_note_style()))])
             p_tag.append(note)
             div_tag.append(p_tag)
             soup.div.append(div_tag)
     return soup
Example 40
    def process_articles(self, title, article, baseurl, into_dir='links'):
        res = ''
        diskpath = os.path.join(self.current_dir, into_dir)
        # The element's class must be set explicitly, otherwise the framework
        # adds one automatically.
        html = "<html><head><title>" + title + "</title><style type='text/css'>" + self.extra_css + "</style></head><body><div class='posts'>"
        for a in article:
            if self.show_progress:
                print '.',
                sys.stdout.flush()
#            self.log("article:" + str(a))
            html += "<div class='post'><div class='post-frame'><img src='" + a['href'] + "' class='post-img'></img><span>" + a['tags'] + "</span></div>"
            html += "</div>"
        html += "</div></body></html>"
        soup = BeautifulSoup(html)
        self.log.debug('Processing images...')
        try:
            self.process_images(soup, baseurl)
        except Exception:
            self.log('Exception while processing images')
        finally:
            self.log('end processing images')
        # build a filesystem-safe file name from the title
        _fname = title
        if not isinstance(_fname, unicode):
            _fname = _fname.decode('latin1', 'replace')
        _fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '')
        _fname = ascii_filename(_fname)
        _fname = os.path.splitext(_fname)[0] + '.xhtml'
        res = os.path.join(diskpath, _fname)
        nurl = baseurl + title
        self.filemap[nurl] = res
        save_soup(soup, res)
        self.downloaded_paths.append(res)
        return res
Example 41
    def generate_annotation_html(self, bookmark):
        from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
        # Returns <div class="user_annotations"> ... </div>
        last_read_location = bookmark.last_read_location
        timestamp = datetime.datetime.utcfromtimestamp(bookmark.timestamp)
        percent_read = bookmark.percent_read

        ka_soup = BeautifulSoup()
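        # dtc tracks the next insertion index within divTag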
        dtc = 0
        divTag = Tag(ka_soup, 'div')
        divTag['class'] = 'user_annotations'

        # Add the last-read location
        spanTag = Tag(ka_soup, 'span')
        spanTag['style'] = 'font-weight:bold'
        if bookmark.book_format == 'pdf':
            spanTag.insert(0,NavigableString(
                _("%(time)s<br />Last Page Read: %(loc)d (%(pr)d%%)") % \
                            dict(time=strftime(u'%x', timestamp.timetuple()),
                            loc=last_read_location,
                            pr=percent_read)))
        else:
            spanTag.insert(0,NavigableString(
                _("%(time)s<br />Last Page Read: Location %(loc)d (%(pr)d%%)") % \
                            dict(time=strftime(u'%x', timestamp.timetuple()),
                            loc=last_read_location,
                            pr=percent_read)))

        divTag.insert(dtc, spanTag)
        dtc += 1
        divTag.insert(dtc, Tag(ka_soup, 'br'))
        dtc += 1

        if bookmark.user_notes:
            user_notes = bookmark.user_notes
            annotations = []

            # Add the annotations sorted by location
            # Italicize highlighted text
            for location in sorted(user_notes):
                if user_notes[location]['text']:
                    annotations.append(
                            _('<b>Location %(dl)d &bull; %(typ)s</b><br />%(text)s<br />') % \
                                        dict(dl=user_notes[location]['displayed_location'],
                                            typ=user_notes[location]['type'],
                                            text=(user_notes[location]['text'] if \
                                            user_notes[location]['type'] == 'Note' else \
                                            '<i>%s</i>' % user_notes[location]['text'])))
                else:
                    if bookmark.book_format == 'pdf':
                        annotations.append(
                                _('<b>Page %(dl)d &bull; %(typ)s</b><br />') % \
                                    dict(dl=user_notes[location]['displayed_location'],
                                        typ=user_notes[location]['type']))
                    else:
                        annotations.append(
                                _('<b>Location %(dl)d &bull; %(typ)s</b><br />') % \
                                    dict(dl=user_notes[location]['displayed_location'],
                                        typ=user_notes[location]['type']))

            for annotation in annotations:
                divTag.insert(dtc, annotation)
                dtc += 1

        ka_soup.insert(0, divTag)
        return ka_soup
Example #43
    def generate_annotation_html(self, bookmark):
        from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString

        # Returns <div class="user_annotations"> ... </div>
        last_read_location = bookmark.last_read_location
        timestamp = datetime.datetime.utcfromtimestamp(bookmark.timestamp)
        percent_read = bookmark.percent_read

        ka_soup = BeautifulSoup()
        dtc = 0
        divTag = Tag(ka_soup, "div")
        divTag["class"] = "user_annotations"

        # Add the last-read location
        spanTag = Tag(ka_soup, "span")
        spanTag["style"] = "font-weight:bold"
        if bookmark.book_format == "pdf":
            spanTag.insert(
                0,
                NavigableString(
                    _("%(time)s<br />Last Page Read: %(loc)d (%(pr)d%%)")
                    % dict(time=strftime(u"%x", timestamp.timetuple()), loc=last_read_location, pr=percent_read)
                ),
            )
        else:
            spanTag.insert(
                0,
                NavigableString(
                    _("%(time)s<br />Last Page Read: Location %(loc)d (%(pr)d%%)")
                    % dict(time=strftime(u"%x", timestamp.timetuple()), loc=last_read_location, pr=percent_read)
                ),
            )

        divTag.insert(dtc, spanTag)
        dtc += 1
        divTag.insert(dtc, Tag(ka_soup, "br"))
        dtc += 1

        if bookmark.user_notes:
            user_notes = bookmark.user_notes
            annotations = []

            # Add the annotations sorted by location
            # Italicize highlighted text
            for location in sorted(user_notes):
                if user_notes[location]["text"]:
                    annotations.append(
                        _("<b>Location %(dl)d &bull; %(typ)s</b><br />%(text)s<br />")
                        % dict(
                            dl=user_notes[location]["displayed_location"],
                            typ=user_notes[location]["type"],
                            text=(
                                user_notes[location]["text"]
                                if user_notes[location]["type"] == "Note"
                                else "<i>%s</i>" % user_notes[location]["text"]
                            ),
                        )
                    )
                else:
                    if bookmark.book_format == "pdf":
                        annotations.append(
                            _("<b>Page %(dl)d &bull; %(typ)s</b><br />")
                            % dict(dl=user_notes[location]["displayed_location"], typ=user_notes[location]["type"])
                        )
                    else:
                        annotations.append(
                            _("<b>Location %(dl)d &bull; %(typ)s</b><br />")
                            % dict(dl=user_notes[location]["displayed_location"], typ=user_notes[location]["type"])
                        )

            for annotation in annotations:
                divTag.insert(dtc, annotation)
                dtc += 1

        ka_soup.insert(0, divTag)
        return ka_soup
Example #44
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    old_soup, new_soup: BeautifulSoup()
    Need to strip <hr>, re-sort based on location, build new merged_soup
    with optional interleaved <hr> elements.
    '''
    TRANSIENT_DB = 'transient'

    if False:
        '''
        Older technique: Use hashes to merge annotations
        '''
        #Get the hashes of any existing annotations
        oiuas = old_soup.findAll('div', 'annotation')
        old_hashes = set([ua['hash'] for ua in oiuas])

        # Extract old user_annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(
                parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))

        # Find new annotations
        uas = new_soup.findAll('div', 'annotation')
        new_hashes = set([ua['hash'] for ua in uas])

        updates = list(new_hashes.difference(old_hashes))
        if len(updates) and ouas is not None:
            # Append new to regurgitated
            dtc = len(regurgitated_soup.div)
            for new_annotation_id in updates:
                new_annotation = new_soup.find('div',
                                               {'hash': new_annotation_id})
                regurgitated_soup.div.insert(dtc, new_annotation)
                dtc += 1
            if old_soup:
                merged_soup = unicode(old_soup) + unicode(
                    sort_merged_annotations(regurgitated_soup))
            else:
                merged_soup = unicode(
                    sort_merged_annotations(regurgitated_soup))
        else:
            if old_soup:
                merged_soup = unicode(old_soup) + unicode(new_soup)
            else:
                merged_soup = unicode(new_soup)
        return merged_soup

    else:
        '''
        Newer technique: Use timestamps to merge annotations
        '''
        timestamps = {}
        # Get the timestamps and hashes of the stored annotations
        suas = old_soup.findAll('div', 'annotation')
        for sua in suas:
            #print("sua: %s" % sua.prettify())
            timestamp = sua.find('td', 'timestamp')['uts']
            timestamps[timestamp] = {'stored_hash': sua['hash']}

        # Rerender stored annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(
                parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))

        # Add device annotation timestamps and hashes
        duas = new_soup.findAll('div', 'annotation')
        for dua in duas:
            timestamp = dua.find('td', 'timestamp')['uts']
            if timestamp in timestamps:
                timestamps[timestamp]['device_hash'] = dua['hash']
            else:
                timestamps[timestamp] = {'device_hash': dua['hash']}

        merged_soup = BeautifulSoup(ANNOTATIONS_HEADER)

        for ts in sorted(timestamps):
            if 'stored_hash' in timestamps[ts] and 'device_hash' not in timestamps[ts]:
                # Stored only - add from regurgitated_soup
                annotation = regurgitated_soup.find(
                    'div', {'hash': timestamps[ts]['stored_hash']})

            elif 'stored_hash' not in timestamps[ts] and 'device_hash' in timestamps[ts]:
                # Device only - add from new_soup
                annotation = new_soup.find(
                    'div', {'hash': timestamps[ts]['device_hash']})

            elif timestamps[ts]['stored_hash'] == timestamps[ts]['device_hash']:
                # Stored matches device - add from regurgitated_soup, as user may have modified
                annotation = regurgitated_soup.find(
                    'div', {'hash': timestamps[ts]['stored_hash']})

            elif timestamps[ts]['stored_hash'] != timestamps[ts]['device_hash']:
                # Device has been updated since initial capture - add from new_soup
                annotation = new_soup.find(
                    'div', {'hash': timestamps[ts]['device_hash']})

            else:
                continue

            merged_soup.div.append(annotation)

        return unicode(sort_merged_annotations(merged_soup))
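
The four-way branch above reduces to a per-timestamp preference rule: keep the stored (re-rendered) copy unless the annotation exists only on the device or its hash has changed there. A self-contained sketch of just that decision (function name and sample data invented):

def choose_source(stored, device, ts):
    '''stored, device: dicts mapping uts timestamp -> annotation hash.'''
    if ts in stored and ts not in device:
        return 'stored'    # library only: keep the re-rendered copy
    if ts in device and ts not in stored:
        return 'device'    # new on the device: take it
    if stored[ts] == device[ts]:
        return 'stored'    # identical: prefer the re-rendered copy
    return 'device'        # hashes differ: the device copy is newer

stored = {'1388534400': 'a1b2', '1388620800': 'c3d4'}
device = {'1388620800': 'c3d4', '1388707200': 'e5f6'}
merged = [(ts, choose_source(stored, device, ts))
          for ts in sorted(set(stored) | set(device))]
# -> [('1388534400', 'stored'), ('1388620800', 'stored'), ('1388707200', 'device')]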
Example #45
    def _reformat(self, data, htmlpath):
        if self.input_encoding:
            data = data.decode(self.input_encoding)
        try:
            data = xml_to_unicode(data, strip_encoding_pats=True)[0]
            soup = BeautifulSoup(data)
        except ValueError:
            # hit some strange encoding problems...
            self.log.exception("Unable to parse html for cleaning, leaving it")
            return data
        # nuke javascript...
        [s.extract() for s in soup('script')]
        # See if everything is inside a <head> tag
        # https://bugs.launchpad.net/bugs/1273512
        body = soup.find('body')
        if body is not None and body.parent.name == 'head':
            html = soup.find('html')
            html.insert(len(html), body)

        # remove forward and back nav bars from the top/bottom of each page,
        # because they disrupt the flow of the text and generally waste space;
        # since we can't use [a,b] syntax to select arbitrary items from a list
        # we'll have to do this manually...
        # only remove the tables, if they have an image with an alt attribute
        # containing prev, next or team
        t = soup('table')
        if t:
            if (t[0].previousSibling is None or t[0].previousSibling.previousSibling is None):
                try:
                    alt = t[0].img['alt'].lower()
                    if alt.find('prev') != -1 or alt.find('next') != -1 or alt.find('team') != -1:
                        t[0].extract()
                except:
                    pass
            if (t[-1].nextSibling is None or t[-1].nextSibling.nextSibling is None):
                try:
                    alt = t[-1].img['alt'].lower()
                    if alt.find('prev') != -1 or alt.find('next') != -1 or alt.find('team') != -1:
                        t[-1].extract()
                except:
                    pass
        # for some very odd reason each page's content appears to be in a table
        # too, and this table has sub-tables for random asides... grr.

        # remove br at top of page if present after nav bars removed
        br = soup('br')
        if br:
            if check_all_prev_empty(br[0].previousSibling):
                br[0].extract()

        # some images seem to be broken in some chm's :/
        base = os.path.dirname(htmlpath)
        for img in soup('img', src=True):
            src = img['src']
            ipath = os.path.join(base, *src.split('/'))
            if os.path.exists(ipath):
                continue
            src = src.split(';')[0]
            if not src:
                continue
            ipath = os.path.join(base, *src.split('/'))
            if not os.path.exists(ipath):
                while src.startswith('../'):
                    src = src[3:]
            img['src'] = src
        try:
            # if there is only a single table with a single element
            # in the body, replace it by the contents of this single element
            tables = soup.body.findAll('table', recursive=False)
            if tables and len(tables) == 1:
                trs = tables[0].findAll('tr', recursive=False)
                if trs and len(trs) == 1:
                    tds = trs[0].findAll('td', recursive=False)
                    if tds and len(tds) == 1:
                        tdContents = tds[0].contents
                        tableIdx = soup.body.contents.index(tables[0])
                        tables[0].extract()
                        while tdContents:
                            soup.body.insert(tableIdx, tdContents.pop())
        except:
            pass
        # do not prettify, it would reformat the <pre> tags!
        try:
            ans = str(soup)
            self.re_encoded_files.add(os.path.abspath(htmlpath))
            return ans
        except RuntimeError:
            return data
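
The single-table flattening at the end of _reformat is easiest to see on a toy document. A standalone sketch using stock bs4 (the source uses calibre's bundled BeautifulSoup, whose BS3-era API differs slightly):

from bs4 import BeautifulSoup

html = '<body><table><tr><td><p>one</p><p>two</p></td></tr></table></body>'
soup = BeautifulSoup(html, 'html.parser')

tables = soup.body.find_all('table', recursive=False)
if len(tables) == 1:
    trs = tables[0].find_all('tr', recursive=False)
    if len(trs) == 1:
        tds = trs[0].find_all('td', recursive=False)
        if len(tds) == 1:
            idx = soup.body.contents.index(tables[0])
            td = tds[0]
            tables[0].extract()
            # insert() reparents each child, so iterate over a copy
            for child in reversed(td.contents[:]):
                soup.body.insert(idx, child)

print(soup)  # <body><p>one</p><p>two</p></body>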
Example #46
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p>' returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text.' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
                for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
        '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(lost_cr.group(),
                                    '%s%s\n\n%s' % (lost_cr.group(1),
                                                    lost_cr.group(2),
                                                    lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '&mdash;')

    soup = BeautifulSoup(comments)
    result = BeautifulSoup()
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if type(token) is NavigableString:
            if not open_pTag:
                pTag = Tag(result,'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc,prepare_string_for_xml(token))
            ptc += 1
        elif type(token) in (CData, Comment, Declaration,
                ProcessingInstruction):
            continue
        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a',
                'hr']:
            if not open_pTag:
                pTag = Tag(result,'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                result.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            result.insert(rtc, token)
            rtc += 1

    if open_pTag:
        result.insert(rtc, pTag)

    for p in result.findAll('p'):
        p['class'] = 'description'

    for t in result.findAll(text=True):
        t.replaceWith(prepare_string_for_xml(unicode(t)))

    return result.renderContents(encoding=None)
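
Restating the docstring's round-trips as calls against the function above (note the code also stamps class="description" on the paragraphs it creates):

comments_to_html(u'plain text')
# -> u'<p class="description">plain text</p>'

comments_to_html(u'A line of text\n\nFollowed by a line of text')
# -> two <p class="description"> paragraphs

comments_to_html(u'<p>pre-formatted text</p>')
# -> returned untouched, since it already starts with '<'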
Example #47
    def to_HTML(self, header=''):
        '''
        Generate HTML with user-specified CSS, element order
        '''
        # Retrieve CSS prefs
        from calibre_plugins.marvin_manager.appearance import default_elements
        stored_css = plugin_prefs.get('appearance_css', default_elements)

        elements = []
        for element in stored_css:
            elements.append(element['name'])
            if element['name'] == 'Note':
                note_style = re.sub('\n', '', element['css'])
            elif element['name'] == 'Text':
                text_style = re.sub('\n', '', element['css'])
            elif element['name'] == 'Timestamp':
                ts_style = re.sub('\n', '', element['css'])

        # Additional CSS for timestamp color and bg to be formatted
        datetime_style = ("background-color:{0};color:{1};" + ts_style)

        # Order the elements according to stored preferences
        comments_body = ''
        for element in elements:
            if element == 'Text':
                comments_body += '{text}'
            elif element == 'Note':
                comments_body += '{note}'
            elif element == 'Timestamp':
                ts_css = '''<table cellpadding="0" width="100%" style="{ts_style}" color="{color}">
                                <tr>
                                    <td class="location" style="text-align:left">{location}</td>
                                    <td class="timestamp" uts="{unix_timestamp}" style="text-align:right">{friendly_timestamp}</td>
                                </tr>
                            </table>'''
                comments_body += re.sub(r'>\s+<', r'><', ts_css)

        if self.annotations:
            soup = BeautifulSoup(ANNOTATIONS_HEADER)
            dtc = 0

            # Add the annotations
            for i, agroup in enumerate(
                    sorted(self.annotations, key=self._annotation_sorter)):
                location = agroup.location
                if location is None:
                    location = ''

                friendly_timestamp = self._timestamp_to_datestr(
                    agroup.timestamp)

                text = ''
                if agroup.text:
                    for agt in agroup.text:
                        text += '<p class="highlight" style="{0}">{1}</p>'.format(
                            text_style, agt)

                note = ''
                if agroup.note:
                    for agn in agroup.note:
                        note += '<p class="note" style="{0}">{1}</p>'.format(
                            note_style, agn)

                try:
                    dt_bgcolor = COLOR_MAP[agroup.highlightcolor]['bg']
                    dt_fgcolor = COLOR_MAP[agroup.highlightcolor]['fg']
                except:
                    if agroup.highlightcolor is None:
                        msg = "No highlight color specified, using Default"
                    else:
                        msg = "Unknown color '%s' specified" % agroup.highlightcolor
                    self._log_location(msg)
                    dt_bgcolor = COLOR_MAP['Default']['bg']
                    dt_fgcolor = COLOR_MAP['Default']['fg']

                if agroup.hash is not None:
                    # Use existing hash when re-rendering
                    hash = agroup.hash
                else:
                    m = hashlib.md5()
                    m.update(text)
                    m.update(note)
                    hash = m.hexdigest()

                divTag = Tag(BeautifulSoup(), 'div')
                content_args = {
                    'color': agroup.highlightcolor,
                    'friendly_timestamp': friendly_timestamp,
                    'location': location,
                    'note': note,
                    'text': text,
                    'ts_style': datetime_style.format(dt_bgcolor, dt_fgcolor),
                    'unix_timestamp': agroup.timestamp,
                }
                divTag.insert(0, comments_body.format(**content_args))
                divTag['class'] = "annotation"
                divTag['genre'] = ''
                if agroup.genre:
                    divTag['genre'] = escape(agroup.genre)
                divTag['hash'] = hash
                divTag['location_sort'] = agroup.location_sort
                divTag['reader'] = agroup.reader_app
                divTag['style'] = ANNOTATION_DIV_STYLE
                soup.div.insert(dtc, divTag)
                dtc += 1
                if i < len(self.annotations) - 1 and \
                    plugin_prefs.get('appearance_hr_checkbox', False):
                    soup.div.insert(
                        dtc,
                        plugin_prefs.get('HORIZONTAL_RULE',
                                         '<hr width="80%" />'))
                    dtc += 1

        else:
            soup = BeautifulSoup(ANNOTATIONS_HEADER)
        return unicode(soup.renderContents())
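
The comments_body assembled above is just a format string: the stored element order decides which placeholders appear, and a single format() call fills them for each annotation. A simplified sketch with invented values:

comments_body = '{text}{note}'    # e.g. prefs ordered Text, Note, Timestamp
comments_body += ('<table cellpadding="0" width="100%" style="{ts_style}">'
                  '<tr><td class="location">{location}</td>'
                  '<td class="timestamp" uts="{unix_timestamp}">'
                  '{friendly_timestamp}</td></tr></table>')

html = comments_body.format(
    text='<p class="highlight">highlighted passage</p>',
    note='<p class="note">reader note</p>',
    ts_style='background-color:#fce2ae;color:#000000;',
    location='Chapter 3',
    unix_timestamp=1388534400,
    friendly_timestamp='January 1, 2014')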
Example #48
    def generate_html(comments):
        args = dict(xmlns=XHTML_NS,
                    title_str=title_str,
                    css=css,
                    title=title,
                    author=author,
                    publisher=publisher,
                    pubdate_label=_('Published'), pubdate=pubdate,
                    series_label=_('Series'), series=series,
                    rating_label=_('Rating'), rating=rating,
                    tags_label=_('Tags'), tags=tags,
                    comments=comments,
                    footer='',
                    searchable_tags=' '.join(escape(t)+'ttt' for t in tags.tags_list),
                    )
        for key in mi.custom_field_keys():
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                key = key.replace('#', '_')
                args[key] = escape(val)
                args[key+'_label'] = escape(display_name)
            except:
                # if the val (custom column contents) is None, don't add to args
                pass

        if False:
            print("Custom column values available in jacket template:")
            for key in args.keys():
                if key.startswith('_') and not key.endswith('_label'):
                    print(" %s: %s" % ('#' + key[1:], args[key]))

        # Used in the comment describing use of custom columns in templates
        # Don't change this unless you also change it in template.xhtml
        args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
        args['_genre'] = args.get('_genre', '{_genre}')

        formatter = SafeFormatter()
        generated_html = formatter.format(template, **args)

        # Post-process the generated html to strip out empty header items

        soup = BeautifulSoup(generated_html)
        if not series:
            series_tag = soup.find(attrs={'class':'cbj_series'})
            if series_tag is not None:
                series_tag.extract()
        if not rating:
            rating_tag = soup.find(attrs={'class':'cbj_rating'})
            if rating_tag is not None:
                rating_tag.extract()
        if not tags:
            tags_tag = soup.find(attrs={'class':'cbj_tags'})
            if tags_tag is not None:
                tags_tag.extract()
        if not pubdate:
            pubdate_tag = soup.find(attrs={'class':'cbj_pubdata'})
            if pubdate_tag is not None:
                pubdate_tag.extract()
        if output_profile.short_name != 'kindle':
            hr_tag = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'})
            if hr_tag is not None:
                hr_tag.extract()

        return strip_encoding_declarations(
                soup.renderContents('utf-8').decode('utf-8'))
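
A note on the custom-column plumbing above: calibre field keys like '#genre' cannot appear inside a format placeholder, so '#' is swapped for '_' before the key goes into args (sketch):

key = '#genre'
args = {}
args[key.replace('#', '_')] = 'Science Fiction'    # -> args['_genre']
args[key.replace('#', '_') + '_label'] = 'Genre'   # -> args['_genre_label']
# template.xhtml can then reference {_genre} and {_genre_label}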
Example #49
def move_annotations(parent, annotation_map, old_destination_field, new_destination_field,
                     window_title="Moving annotations"):
    '''
    Move annotations from old_destination_field to new_destination_field
    annotation_map precalculated in thread in config.py
    '''
    import calibre_plugins.annotations.config as cfg

    _log_location("%s -> %s" % (old_destination_field, new_destination_field))

    db = parent.opts.gui.current_db
    id = db.FIELD_MAP['id']

    # Show progress
    pb = ProgressBar(parent=parent, window_title=window_title, on_top=True)
    total_books = len(annotation_map)
    pb.set_maximum(total_books)
    pb.set_value(1)
    pb.set_label('{:^100}'.format('Moving annotations for %d books' % total_books))
    pb.show()

    transient_db = 'transient'

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        cfg.plugin_prefs.get('COMMENTS_DIVIDER', '&middot;  &middot;  &bull;  &middot;  &#x2726;  &middot;  &bull;  &middot; &middot;'))

    for cid in annotation_map:
        mi = db.get_metadata(cid, index_is_id=True)

        # Comments -> custom
        if old_destination_field == 'Comments' and new_destination_field.startswith('#'):
            if mi.comments:
                old_soup = BeautifulSoup(mi.comments)
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from Comments
                    uas.extract()

                    # Remove comments_divider from Comments
                    cd = old_soup.find('div', 'comments_divider')
                    if cd:
                        cd.extract()

                    # Save stripped Comments
                    mi.comments = unicode(old_soup)

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Add user_annotations to destination
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record with stripped Comments, populated custom field
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # custom -> Comments
        elif old_destination_field.startswith('#') and new_destination_field == 'Comments':
            if mi.get_user_metadata(old_destination_field, False)['#value#'] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um['#value#'] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add user_annotations to Comments
                    if mi.comments is None:
                        mi.comments = unicode(new_soup)
                    else:
                        mi.comments = mi.comments + \
                                      unicode(comments_divider) + \
                                      unicode(new_soup)

                    # Update the record with stripped custom field, updated Comments
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # custom -> custom
        elif old_destination_field.startswith('#') and new_destination_field.startswith('#'):

            if mi.get_user_metadata(old_destination_field, False)['#value#'] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um['#value#'] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # same field -> same field - called from config:configure_appearance()
        elif (old_destination_field == new_destination_field):
            pb.set_label('{:^100}'.format('Updating annotations for %d books' % total_books))

            if new_destination_field == 'Comments':
                if mi.comments:
                    old_soup = BeautifulSoup(mi.comments)
                    uas = old_soup.find('div', 'user_annotations')
                    if uas:
                        # Remove user_annotations from Comments
                        uas.extract()

                        # Remove comments_divider from Comments
                        cd = old_soup.find('div', 'comments_divider')
                        if cd:
                            cd.extract()

                        # Save stripped Comments
                        mi.comments = unicode(old_soup)

                        # Capture content
                        parent.opts.db.capture_content(uas, cid, transient_db)

                        # Regurgitate content with current CSS style
                        new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                        # Add user_annotations to Comments
                        if mi.comments is None:
                            mi.comments = unicode(new_soup)
                        else:
                            mi.comments = mi.comments + \
                                          unicode(comments_divider) + \
                                          unicode(new_soup)

                        # Update the record with stripped custom field, updated Comments
                        db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                        commit=True, force_changes=True, notify=True)
                        pb.increment()

            else:
                # Update custom field
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Add stripped old_soup plus new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(old_soup) + unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

    # Hide the progress bar
    pb.hide()

    # Change field value to friendly name
    if old_destination_field.startswith('#'):
        for cf in parent.custom_fields:
            if parent.custom_fields[cf]['field'] == old_destination_field:
                old_destination_field = cf
                break
    if new_destination_field.startswith('#'):
        for cf in parent.custom_fields:
            if parent.custom_fields[cf]['field'] == new_destination_field:
                new_destination_field = cf
                break

    # Report what happened
    if old_destination_field == new_destination_field:
        msg = "<p>Annotations updated to new appearance settings for %d {0}.</p>" % len(annotation_map)
    else:
        msg = ("<p>Annotations for %d {0} moved from <b>%s</b> to <b>%s</b>.</p>" %
                (len(annotation_map), old_destination_field, new_destination_field))
    if len(annotation_map) == 1:
        msg = msg.format('book')
    else:
        msg = msg.format('books')
    MessageBox(MessageBox.INFO,
               '',
               msg=msg,
               show_copy_button=False,
               parent=parent.gui).exec_()
    _log_location()
    _log("INFO: %s" % msg)

    # Update the UI
    updateCalibreGUIView()
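
A typical invocation, sketched with invented values (parent is the plugin window whose opts carry the annotations database; annotation_map is the list of calibre book ids precomputed in config.py):

move_annotations(parent,
                 annotation_map=[412, 857],
                 old_destination_field='Comments',
                 new_destination_field='#annotations',
                 window_title="Moving annotations")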
Example #50
def move_annotations(
    parent, annotation_map, old_destination_field, new_destination_field, window_title="Moving annotations"
):
    """
    Move annotations from old_destination_field to new_destination_field
    annotation_map precalculated in thread in config.py
    """
    import calibre_plugins.marvin_manager.config as cfg

    _log_location(annotation_map)
    _log(" %s -> %s" % (old_destination_field, new_destination_field))

    db = parent.opts.gui.current_db
    id = db.FIELD_MAP["id"]

    # Show progress
    pb = ProgressBar(parent=parent, window_title=window_title)
    total_books = len(annotation_map)
    pb.set_maximum(total_books)
    pb.set_value(1)
    pb.set_label("{:^100}".format("Moving annotations for %d books" % total_books))
    pb.show()

    transient_db = "transient"

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        cfg.plugin_prefs.get(
            "COMMENTS_DIVIDER", "&middot;  &middot;  &bull;  &middot;  &#x2726;  &middot;  &bull;  &middot; &middot;"
        )
    )

    for cid in annotation_map:
        mi = db.get_metadata(cid, index_is_id=True)

        # Comments -> custom
        if old_destination_field == "Comments" and new_destination_field.startswith("#"):
            if mi.comments:
                old_soup = BeautifulSoup(mi.comments)
                uas = old_soup.find("div", "user_annotations")
                if uas:
                    # Remove user_annotations from Comments
                    uas.extract()

                    # Remove comments_divider from Comments
                    cd = old_soup.find("div", "comments_divider")
                    if cd:
                        cd.extract()

                    # Save stripped Comments
                    mi.comments = unicode(old_soup)

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Add user_annotations to destination
                    um = mi.metadata_for_field(new_destination_field)
                    um["#value#"] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record with stripped Comments, populated custom field
                    db.set_metadata(
                        cid, mi, set_title=False, set_authors=False, commit=True, force_changes=True, notify=True
                    )
                    pb.increment()

        # custom -> Comments
        elif old_destination_field.startswith("#") and new_destination_field == "Comments":
            if mi.get_user_metadata(old_destination_field, False)["#value#"] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)["#value#"])
                uas = old_soup.find("div", "user_annotations")
                if uas:
                    # Remove user_annotations from custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um["#value#"] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add user_annotations to Comments
                    if mi.comments is None:
                        mi.comments = unicode(new_soup)
                    else:
                        mi.comments = mi.comments + unicode(comments_divider) + unicode(new_soup)

                    # Update the record with stripped custom field, updated Comments
                    db.set_metadata(
                        cid, mi, set_title=False, set_authors=False, commit=True, force_changes=True, notify=True
                    )
                    pb.increment()

        # custom -> custom
        elif old_destination_field.startswith("#") and new_destination_field.startswith("#"):

            if mi.get_user_metadata(old_destination_field, False)["#value#"] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)["#value#"])
                uas = old_soup.find("div", "user_annotations")
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um["#value#"] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um["#value#"] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(
                        cid, mi, set_title=False, set_authors=False, commit=True, force_changes=True, notify=True
                    )
                    pb.increment()

        # same field -> same field - called from config:configure_appearance()
        elif old_destination_field == new_destination_field:
            pb.set_label("{:^100}".format("Updating annotations for %d books" % total_books))

            if new_destination_field == "Comments":
                if mi.comments:
                    old_soup = BeautifulSoup(mi.comments)
                    uas = old_soup.find("div", "user_annotations")
                    if uas:
                        # Remove user_annotations from Comments
                        uas.extract()

                        # Remove comments_divider from Comments
                        cd = old_soup.find("div", "comments_divider")
                        if cd:
                            cd.extract()

                        # Save stripped Comments
                        mi.comments = unicode(old_soup)

                        # Capture content
                        parent.opts.db.capture_content(uas, cid, transient_db)

                        # Regurgitate content with current CSS style
                        new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                        # Add user_annotations to Comments
                        if mi.comments is None:
                            mi.comments = unicode(new_soup)
                        else:
                            mi.comments = mi.comments + unicode(comments_divider) + unicode(new_soup)

                        # Update the record with stripped custom field, updated Comments
                        db.set_metadata(
                            cid, mi, set_title=False, set_authors=False, commit=True, force_changes=True, notify=True
                        )
                        pb.increment()

            else:
                # Update custom field
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)["#value#"])
                uas = old_soup.find("div", "user_annotations")
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Add stripped old_soup plus new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um["#value#"] = unicode(old_soup) + unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(
                        cid, mi, set_title=False, set_authors=False, commit=True, force_changes=True, notify=True
                    )
                    pb.increment()

    # Hide the progress bar
    pb.hide()

    # Get the eligible custom fields
    all_custom_fields = db.custom_field_keys()
    custom_fields = {}
    for cf in all_custom_fields:
        field_md = db.metadata_for_field(cf)
        if field_md["datatype"] in ["comments"]:
            custom_fields[field_md["name"]] = {"field": cf, "datatype": field_md["datatype"]}

    # Change field value to friendly name
    if old_destination_field.startswith("#"):
        for cf in custom_fields:
            if custom_fields[cf]["field"] == old_destination_field:
                old_destination_field = cf
                break
    if new_destination_field.startswith("#"):
        for cf in custom_fields:
            if custom_fields[cf]["field"] == new_destination_field:
                new_destination_field = cf
                break

    # Report what happened
    if old_destination_field == new_destination_field:
        msg = "<p>Annotations updated to new appearance settings for %d {0}.</p>" % len(annotation_map)
    else:
        msg = "<p>Annotations for %d {0} moved from <b>%s</b> to <b>%s</b>.</p>" % (
            len(annotation_map),
            old_destination_field,
            new_destination_field,
        )
    if len(annotation_map) == 1:
        msg = msg.format("book")
    else:
        msg = msg.format("books")
    MessageBox(MessageBox.INFO, "", msg=msg, show_copy_button=False, parent=parent.gui).exec_()
    _log("INFO: %s" % msg)

    # Update the UI
    updateCalibreGUIView()
Example #51
def move_annotations(parent,
                     annotation_map,
                     old_destination_field,
                     new_destination_field,
                     window_title="Moving annotations"):
    '''
    Move annotations from old_destination_field to new_destination_field
    annotation_map precalculated in thread in config.py
    '''
    import calibre_plugins.marvin_manager.config as cfg

    _log_location(annotation_map)
    _log(" %s -> %s" % (old_destination_field, new_destination_field))

    db = parent.opts.gui.current_db
    id = db.FIELD_MAP['id']

    # Show progress
    pb = ProgressBar(parent=parent, window_title=window_title)
    total_books = len(annotation_map)
    pb.set_maximum(total_books)
    pb.set_value(1)
    pb.set_label('{:^100}'.format('Moving annotations for %d books' %
                                  total_books))
    pb.show()

    transient_db = 'transient'

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        cfg.plugin_prefs.get(
            'COMMENTS_DIVIDER',
            '&middot;  &middot;  &bull;  &middot;  &#x2726;  &middot;  &bull;  &middot; &middot;'
        ))

    for cid in annotation_map:
        mi = db.get_metadata(cid, index_is_id=True)

        # Comments -> custom
        if old_destination_field == 'Comments' and new_destination_field.startswith(
                '#'):
            if mi.comments:
                old_soup = BeautifulSoup(mi.comments)
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from Comments
                    uas.extract()

                    # Remove comments_divider from Comments
                    cd = old_soup.find('div', 'comments_divider')
                    if cd:
                        cd.extract()

                    # Save stripped Comments
                    mi.comments = unicode(old_soup)

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(
                        transient_db, cid)

                    # Add user_annotations to destination
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record with stripped Comments, populated custom field
                    db.set_metadata(cid,
                                    mi,
                                    set_title=False,
                                    set_authors=False,
                                    commit=True,
                                    force_changes=True,
                                    notify=True)
                    pb.increment()

        # custom -> Comments
        elif old_destination_field.startswith(
                '#') and new_destination_field == 'Comments':
            if mi.get_user_metadata(old_destination_field,
                                    False)['#value#'] is not None:
                old_soup = BeautifulSoup(
                    mi.get_user_metadata(old_destination_field,
                                         False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(
                        transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um['#value#'] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add user_annotations to Comments
                    if mi.comments is None:
                        mi.comments = unicode(new_soup)
                    else:
                        mi.comments = mi.comments + \
                                      unicode(comments_divider) + \
                                      unicode(new_soup)

                    # Update the record with stripped custom field, updated Comments
                    db.set_metadata(cid,
                                    mi,
                                    set_title=False,
                                    set_authors=False,
                                    commit=True,
                                    force_changes=True,
                                    notify=True)
                    pb.increment()

        # custom -> custom
        elif old_destination_field.startswith(
                '#') and new_destination_field.startswith('#'):

            if mi.get_user_metadata(old_destination_field,
                                    False)['#value#'] is not None:
                old_soup = BeautifulSoup(
                    mi.get_user_metadata(old_destination_field,
                                         False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(
                        transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um['#value#'] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(cid,
                                    mi,
                                    set_title=False,
                                    set_authors=False,
                                    commit=True,
                                    force_changes=True,
                                    notify=True)
                    pb.increment()

        # same field -> same field - called from config:configure_appearance()
        elif (old_destination_field == new_destination_field):
            pb.set_label('{:^100}'.format('Updating annotations for %d books' %
                                          total_books))

            if new_destination_field == 'Comments':
                if mi.comments:
                    old_soup = BeautifulSoup(mi.comments)
                    uas = old_soup.find('div', 'user_annotations')
                    if uas:
                        # Remove user_annotations from Comments
                        uas.extract()

                        # Remove comments_divider from Comments
                        cd = old_soup.find('div', 'comments_divider')
                        if cd:
                            cd.extract()

                        # Save stripped Comments
                        mi.comments = unicode(old_soup)

                        # Capture content
                        parent.opts.db.capture_content(uas, cid, transient_db)

                        # Regurgitate content with current CSS style
                        new_soup = parent.opts.db.rerender_to_html(
                            transient_db, cid)

                        # Add user_annotations to Comments
                        if mi.comments is None:
                            mi.comments = unicode(new_soup)
                        else:
                            mi.comments = mi.comments + \
                                          unicode(comments_divider) + \
                                          unicode(new_soup)

                        # Update the record with stripped custom field, updated Comments
                        db.set_metadata(cid,
                                        mi,
                                        set_title=False,
                                        set_authors=False,
                                        commit=True,
                                        force_changes=True,
                                        notify=True)
                        pb.increment()

            else:
                # Update custom field
                old_soup = BeautifulSoup(
                    mi.get_user_metadata(old_destination_field,
                                         False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(
                        transient_db, cid)

                    # Add stripped old_soup plus new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(old_soup) + unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(cid,
                                    mi,
                                    set_title=False,
                                    set_authors=False,
                                    commit=True,
                                    force_changes=True,
                                    notify=True)
                    pb.increment()

    # Hide the progress bar
    pb.hide()

    # Get the eligible custom fields
    all_custom_fields = db.custom_field_keys()
    custom_fields = {}
    for cf in all_custom_fields:
        field_md = db.metadata_for_field(cf)
        if field_md['datatype'] in ['comments']:
            custom_fields[field_md['name']] = {
                'field': cf,
                'datatype': field_md['datatype']
            }

    # Change field value to friendly name
    if old_destination_field.startswith('#'):
        for cf in custom_fields:
            if custom_fields[cf]['field'] == old_destination_field:
                old_destination_field = cf
                break
    if new_destination_field.startswith('#'):
        for cf in custom_fields:
            if custom_fields[cf]['field'] == new_destination_field:
                new_destination_field = cf
                break

    # Report what happened
    if old_destination_field == new_destination_field:
        msg = "<p>Annotations updated to new appearance settings for %d {0}.</p>" % len(
            annotation_map)
    else:
        msg = (
            "<p>Annotations for %d {0} moved from <b>%s</b> to <b>%s</b>.</p>"
            % (len(annotation_map), old_destination_field,
               new_destination_field))
    if len(annotation_map) == 1:
        msg = msg.format('book')
    else:
        msg = msg.format('books')
    MessageBox(MessageBox.INFO,
               '',
               msg=msg,
               show_copy_button=False,
               parent=parent.gui).exec_()
    _log("INFO: %s" % msg)

    # Update the UI
    updateCalibreGUIView()
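
The field move above relies on calibre's user-metadata round trip: read the
metadata object, rewrite the custom column's '#value#', and push it back. A
condensed sketch of that pattern in isolation, assuming a calibre database
handle db, a book id cid, a BeautifulSoup fragment new_soup as above, and a
hypothetical comments-type column named '#annotations':

    # Sketch only: '#annotations' is an illustrative column name
    mi = db.get_metadata(cid, index_is_id=True)
    um = mi.metadata_for_field('#annotations')
    um['#value#'] = (um.get('#value#') or '') + unicode(new_soup)
    mi.set_user_metadata('#annotations', um)
    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                    commit=True, force_changes=True, notify=True)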
Example #52
    def generate_html(comments):
        args = dict(
            xmlns=XHTML_NS,
            title_str=title_str,
            css=css,
            title=title,
            author=author,
            publisher=publisher,
            pubdate_label=_("Published"),
            pubdate=pubdate,
            series_label=_("Series"),
            series=series,
            rating_label=_("Rating"),
            rating=rating,
            tags_label=_("Tags"),
            tags=tags,
            comments=comments,
            footer="",
        )
        for key in mi.custom_field_keys():
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                key = key.replace("#", "_")
                args[key] = escape(val)
                args[key + "_label"] = escape(display_name)
            except Exception:
                # if the val (custom column contents) is None, don't add to args
                pass

        if False:
            print("Custom column values available in jacket template:")
            for key in args.keys():
                if key.startswith("_") and not key.endswith("_label"):
                    print(" %s: %s" % ("#" + key[1:], args[key]))

        # Used in the comment describing use of custom columns in templates
        # Don't change this unless you also change it in template.xhtml
        args["_genre_label"] = args.get("_genre_label", "{_genre_label}")
        args["_genre"] = args.get("_genre", "{_genre}")

        formatter = SafeFormatter()
        generated_html = formatter.format(template, **args)

        # Post-process the generated html to strip out empty header items

        soup = BeautifulSoup(generated_html)
        if not series:
            series_tag = soup.find(attrs={"class": "cbj_series"})
            if series_tag is not None:
                series_tag.extract()
        if not rating:
            rating_tag = soup.find(attrs={"class": "cbj_rating"})
            if rating_tag is not None:
                rating_tag.extract()
        if not tags:
            tags_tag = soup.find(attrs={"class": "cbj_tags"})
            if tags_tag is not None:
                tags_tag.extract()
        if not pubdate:
            pubdate_tag = soup.find(attrs={"class": "cbj_pubdata"})
            if pubdate_tag is not None:
                pubdate_tag.extract()
        if output_profile.short_name != "kindle":
            hr_tag = soup.find("hr", attrs={"class": "cbj_kindle_banner_hr"})
            if hr_tag is not None:
                hr_tag.extract()

        return strip_encoding_declarations(soup.renderContents("utf-8").decode("utf-8"))
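
For orientation, the args dict above is consumed by SafeFormatter against the
placeholders in template.xhtml. A toy rendering under the assumption that the
placeholders are named after the args keys (the template text here is
illustrative, not the shipped template.xhtml):

    formatter = SafeFormatter()
    template = u'<h1>{title}</h1><div class="cbj_comments">{comments}</div>'
    html = formatter.format(template, title=u'Example', comments=u'<p>...</p>')
    # html == u'<h1>Example</h1><div class="cbj_comments"><p>...</p></div>'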
Example #53
    def _reformat(self, data, htmlpath):
        if self.input_encoding:
            data = data.decode(self.input_encoding)
        try:
            data = xml_to_unicode(data, strip_encoding_pats=True)[0]
            soup = BeautifulSoup(data)
        except ValueError:
            # hit some strange encoding problems...
            self.log.exception("Unable to parse html for cleaning, leaving it")
            return data
        # Remove all javascript
        for s in soup('script'):
            s.extract()
        # See if everything is inside a <head> tag
        # https://bugs.launchpad.net/bugs/1273512
        body = soup.find('body')
        if body is not None and body.parent.name == 'head':
            html = soup.find('html')
            html.insert(len(html), body)

        # Remove forward and back nav bars from the top/bottom of each page,
        # as they disrupt the flow of the content and waste space.
        # Since we can't use [a,b] syntax to select arbitrary items from a
        # list, this has to be done manually.
        # Only remove the tables if they contain an image whose alt attribute
        # contains prev, next or team.
        t = soup('table')
        if t:
            if (t[0].previousSibling is None
                    or t[0].previousSibling.previousSibling is None):
                try:
                    alt = t[0].img['alt'].lower()
                    if alt.find('prev') != -1 or alt.find(
                            'next') != -1 or alt.find('team') != -1:
                        t[0].extract()
                except:
                    pass
            if (t[-1].nextSibling is None
                    or t[-1].nextSibling.nextSibling is None):
                try:
                    alt = t[-1].img['alt'].lower()
                    if alt.find('prev') != -1 or alt.find(
                            'next') != -1 or alt.find('team') != -1:
                        t[-1].extract()
                except:
                    pass
        # Each page's content also tends to be wrapped in a table, which in
        # turn has sub-tables for asides; a single-cell wrapper is removed below.

        # remove br at top of page if present after nav bars removed
        br = soup('br')
        if br:
            if check_all_prev_empty(br[0].previousSibling):
                br[0].extract()

        # Some images are broken in some CHM files; fix up their src paths
        base = os.path.dirname(htmlpath)
        for img in soup('img', src=True):
            src = img['src']
            ipath = os.path.join(base, *src.split('/'))
            if os.path.exists(ipath):
                continue
            src = src.split(';')[0]
            if not src:
                continue
            ipath = os.path.join(base, *src.split('/'))
            if not os.path.exists(ipath):
                while src.startswith('../'):
                    src = src[3:]
            img['src'] = src
        try:
            # if there is only a single table with a single element
            # in the body, replace it by the contents of this single element
            tables = soup.body.findAll('table', recursive=False)
            if tables and len(tables) == 1:
                trs = tables[0].findAll('tr', recursive=False)
                if trs and len(trs) == 1:
                    tds = trs[0].findAll('td', recursive=False)
                    if tds and len(tds) == 1:
                        tdContents = tds[0].contents
                        tableIdx = soup.body.contents.index(tables[0])
                        tables[0].extract()
                        while tdContents:
                            soup.body.insert(tableIdx, tdContents.pop())
        except:
            pass
        # do not prettify, it would reformat the <pre> tags!
        try:
            ans = str(soup)
            self.re_encoded_files.add(os.path.abspath(htmlpath))
            return ans
        except RuntimeError:
            return data
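
The <br> trimming above calls check_all_prev_empty(), which is not part of
this snippet. A minimal sketch of what such a helper presumably does, walking
previousSibling links until a non-empty node is found (an assumption, not the
shipped implementation):

    def check_all_prev_empty(tag):
        # Hypothetical helper: True when everything before `tag` is either
        # missing or whitespace-only text, i.e. the <br> sits at the top
        if tag is None:
            return True
        if getattr(tag, 'name', None) is None and not unicode(tag).strip():
            return check_all_prev_empty(tag.previousSibling)
        return False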
Example #54
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p>' returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML; do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
                for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.',
        '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(lost_cr.group(),
                                    '%s%s\n\n%s' % (lost_cr.group(1),
                                                    lost_cr.group(2),
                                                    lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '&mdash;')

    soup = BeautifulSoup(comments)
    result = BeautifulSoup()
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if type(token) is NavigableString:
            if not open_pTag:
                pTag = Tag(result,'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc,prepare_string_for_xml(token))
            ptc += 1
        elif type(token) in (CData, Comment, Declaration,
                ProcessingInstruction):
            continue
        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a',
                'hr']:
            if not open_pTag:
                pTag = Tag(result,'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                result.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            result.insert(rtc, token)
            rtc += 1

    if open_pTag:
        result.insert(rtc, pTag)

    for p in result.findAll('p'):
        p['class'] = 'description'

    for t in result.findAll(text=True):
        t.replaceWith(prepare_string_for_xml(unicode(t)))

    return result.renderContents(encoding=None)
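
Assuming the surrounding calibre environment, the docstring cases above map to
calls like these (expected results shown as comments; note the actual output
carries class="description" on each paragraph):

    comments_to_html(u'plain text')
    # -> <p class="description">plain text</p>
    comments_to_html(u'A line of text\n\nFollowed by a line of text')
    # -> <p class="description">A line of text</p>
    #    <p class="description">Followed by a line of text</p>
    comments_to_html(u'<p>pre-formatted text</p>')
    # -> <p>pre-formatted text</p> (already HTML, returned untouched)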
Example #55
    def start_fetch(self, url):
        soup = BeautifulSoup(u'<a href="' + url + '" />')
        res = self.process_links(soup, url, 0, into_dir='')
        self.log.debug(url, 'saved to', res)
        return res
Example #56
    def get_soup(self, src, url=None):
        nmassage = []
        nmassage.extend(self.preprocess_regexps)
        # Remove HTML comments, as extracting tags can otherwise leave
        # detritus in the form of multiple nested comments
        nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
        usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
        usrc = self.preprocess_raw_html(usrc, url)
        for pat, repl in nmassage:
            usrc = pat.sub(repl, usrc)
        soup = BeautifulSoup(usrc)

        replace = self.prepreprocess_html_ext(soup)
        if replace is not None:
            replace = xml_to_unicode(replace,
                                     self.verbose,
                                     strip_encoding_pats=True)[0]
            for pat, repl in nmassage:
                replace = pat.sub(repl, replace)
            soup = BeautifulSoup(replace)

        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after is not None:
            rt = [self.remove_tags_after] if isinstance(
                self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before is not None:
            rt = [self.remove_tags_before] if isinstance(
                self.remove_tags_before, dict) else self.remove_tags_before
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previousSibling')

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
        return self.preprocess_html_ext(soup)
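
get_soup() is driven by the recipe attributes keep_only_tags,
remove_tags_before, remove_tags_after and remove_tags. A minimal sketch of a
recipe wiring them up (the class name and the selectors are illustrative
assumptions, not a real recipe):

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title = 'Example'
        # Keep only the article body...
        keep_only_tags = [dict(name='div', attrs={'id': 'article'})]
        # ...drop everything before the headline and after the byline...
        remove_tags_before = dict(name='h1')
        remove_tags_after = dict(name='div', attrs={'class': 'byline'})
        # ...and strip ad containers wherever they appear
        remove_tags = [dict(name='div', attrs={'class': 'ad'})]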
Example #57
    def update_results(self, trigger):
        #self._log_location(trigger)
        reader_to_match = str(self.find_annotations_reader_comboBox.currentText())
        color_to_match = str(self.find_annotations_color_comboBox.currentText())
        text_to_match = str(self.find_annotations_text_lineEdit.text())
        note_to_match = str(self.find_annotations_note_lineEdit.text())

        from_date = self.find_annotations_date_from_dateEdit.dateTime().toTime_t()
        to_date = self.find_annotations_date_to_dateEdit.dateTime().toTime_t()

        annotation_map = self.annotated_books_scanner.annotation_map
        #field = self.prefs.get("cfg_annotations_destination_field", None)
        field = get_cc_mapping('annotations', 'field', None)

        db = self.opts.gui.current_db
        matched_titles = []
        self.matched_ids = set()

        for cid in annotation_map:
            mi = db.get_metadata(cid, index_is_id=True)
            soup = None
            if field == 'Comments':
                if mi.comments:
                    soup = BeautifulSoup(mi.comments)
            else:
                if mi.get_user_metadata(field, False)['#value#'] is not None:
                    soup = BeautifulSoup(mi.get_user_metadata(field, False)['#value#'])
            if soup:
                uas = soup.findAll('div', 'annotation')
                for ua in uas:
                    # Has this book already been matched?
                    if cid in self.matched_ids:
                        continue

                    # Check reader
                    if reader_to_match != self.GENERIC_READER:
                        this_reader = ua['reader']
                        if this_reader != reader_to_match:
                            continue

                    # Check color
                    if color_to_match != self.GENERIC_STYLE:
                        this_color = ua.find('table')['color']
                        if this_color != color_to_match:
                            continue

                    # Check date range, allow for mangled timestamp
                    try:
                        timestamp = float(ua.find('td', 'timestamp')['uts'])
                        if timestamp < from_date or timestamp > to_date:
                            continue
                    except:
                        continue

                    highlight_text = ''
                    try:
                        pels = ua.findAll('p', 'highlight')
                        for pel in pels:
                            highlight_text += pel.string + '\n'
                    except:
                        pass
                    if text_to_match:
                        if not re.search(text_to_match, highlight_text, flags=re.IGNORECASE):
                            continue

                    note_text = ''
                    try:
                        nels = ua.findAll('p', 'note')
                        for nel in nels:
                            note_text += nel.string + '\n'
                    except:
                        pass
                    if note_to_match:
                        if not re.search(note_to_match, note_text, flags=re.IGNORECASE):
                            continue

                    # If we made it this far, add the id to matched_ids
                    self.matched_ids.add(cid)
                    matched_titles.append(mi.title)

        # Update the results box
        matched_titles.sort()
        if len(annotation_map):
            if len(matched_titles):
                first_match = ("<i>%s</i>" % matched_titles[0])
                if len(matched_titles) == 1:
                    results = first_match
                else:
                    results = first_match + (" and %d more." % (len(matched_titles) - 1))
                self.result_label.setText('<p style="color:blue">{0}</p>'.format(results))
            else:
                self.result_label.setText('<p style="color:red">no matches</p>')
        else:
            self.result_label.setText('<p style="color:red">no annotated books in library</p>')

        self.resize_dialog()
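
The text and note filters above boil down to a case-insensitive re.search over
the concatenated paragraph strings. In isolation (sample strings are
illustrative):

    import re

    highlight_text = u'The quick brown fox\n'
    text_to_match = u'BROWN'
    if not re.search(text_to_match, highlight_text, flags=re.IGNORECASE):
        print('no match; this annotation would be skipped')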
Example #58
    def preprocess_html(self, soup):
        if self.webEdition and self.oldest_article > 0:
            date_tag = soup.find(True,attrs={'class': ['dateline','date']})
            if date_tag:
                date_str = self.tag_to_string(date_tag,use_alt=False)
                date_str = date_str.replace('Published:','')
                date_items = date_str.split(',')
                try:
                    datestring = date_items[0]+' '+date_items[1]
                    article_date = self.decode_us_date(datestring)
                except:
                    article_date = date.today()
                if article_date < self.earliest_date:
                    self.log("Skipping article dated %s" % date_str)
                    return None

        #all articles are from today, no need to print the date on every page
        try:
            if not self.webEdition:
                date_tag = soup.find(True,attrs={'class': ['dateline','date']})
                if date_tag:
                    date_tag.extract()
        except:
            self.log("Error removing the published date")

        if self.useHighResImages:
            try:
                #open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
                enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                if enlargeThisList:
                    for popupref in enlargeThisList:
                        popupreflink = popupref.find('a')
                        if popupreflink:
                            reflinkstring = str(popupreflink['href'])
                            refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('")
                            refend = reflinkstring.find(".html", refstart) + len(".html")
                            reflinkstring = reflinkstring[refstart:refend]

                            popuppage = self.browser.open(reflinkstring)
                            popuphtml = popuppage.read()
                            popuppage.close()
                            if popuphtml:
                                st = time.localtime()
                                year = str(st.tm_year)
                                month = "%.2d" % st.tm_mon
                                day = "%.2d" % st.tm_mday
                                imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/')
                                highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
                                popupSoup = BeautifulSoup(popuphtml)
                                highResTag = popupSoup.find('img', {'src':highResImageLink})
                                if highResTag:
                                    imageTag = None
                                    try:
                                        newWidth = highResTag['width']
                                        newHeight = highResTag['height']
                                        imageTag = popupref.parent.find("img")
                                    except:
                                        self.log("Error: finding width and height of img")
                                    popupref.extract()
                                    if imageTag:
                                        try:
                                            imageTag['src'] = highResImageLink
                                            imageTag['width'] = newWidth
                                            imageTag['height'] = newHeight
                                        except:
                                            self.log("Error setting the src width and height parameters")
            except Exception:
                self.log("Error pulling high resolution images")

            try:
                #remove "Related content" bar
                runAroundsFound = soup.findAll('div',{'class':['articleInline runaroundLeft','articleInline doubleRule runaroundLeft','articleInline runaroundLeft firstArticleInline','articleInline runaroundLeft  ','articleInline runaroundLeft  lastArticleInline']})
                if runAroundsFound:
                    for runAround in runAroundsFound:
                        #find all section headers
                        hlines = runAround.findAll(True ,{'class':['sectionHeader','sectionHeader flushBottom']})
                        if hlines:
                            for hline in hlines:
                                hline.extract()

                        #find all h6 section headers
                        hlines = runAround.findAll('h6')
                        if hlines:
                            for hline in hlines:
                                hline.extract()
            except:
                self.log("Error removing related content bar")


            try:
                #in case pulling images failed, delete the enlarge this text
                enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
                if enlargeThisList:
                    for popupref in enlargeThisList:
                        popupref.extract()
            except:
                self.log("Error removing Enlarge this text")

        return self.strip_anchors(soup)
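
The high-resolution image link above is recovered with plain string slicing. A
toy run of the same arithmetic (the page fragment and URL are illustrative):

    popuphtml = '<img src="http://graphics8.nytimes.com/images/2012/01/02/photo.jpg">'
    prefix = 'http://graphics8.nytimes.com/images/2012/01/02/'
    imgstartpos = popuphtml.find(prefix) + len(prefix)
    link = prefix + popuphtml[imgstartpos:popuphtml.find('.jpg', imgstartpos) + 4]
    # link == 'http://graphics8.nytimes.com/images/2012/01/02/photo.jpg'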
Example #59
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    old_soup, new_soup: BeautifulSoup()
    Need to strip <hr>, re-sort based on location, build new merged_soup
    with optional interleaved <hr> elements.
    '''
    TRANSIENT_DB = 'transient'
    debug_print("merge_annotations - cid=", cid)
    debug_print("merge_annotations - old_soup=", old_soup)
    debug_print("merge_annotations - new_soup=", new_soup)

    # Fetch preferred merge index technique
    merge_index = getattr(parent.reader_app_class, 'MERGE_INDEX', 'hash')

    if merge_index == 'hash':
        # Get the hashes of any existing annotations
        oiuas = old_soup.findAll('div', 'annotation')
        old_hashes = set([ua['hash'] for ua in oiuas])
        debug_print("old hashes=", old_hashes)

        # Extract old user_annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            debug_print("Getting old annotations - count=", len(ouas))
            debug_print("Getting old annotations - old_soup=", old_soup)
            debug_print("Getting old annotations - ouas=", ouas)
            ouas.extract()
            debug_print("Getting old annotations - ouas after extract=", ouas)
            debug_print("Getting old annotations - old_soup after extract=",
                        old_soup)

            # Capture existing annotations
            annotation_list = parent.opts.db.capture_content(
                ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(
                parent.opts.db.rerender_to_html_from_list(annotation_list))
            debug_print("Getting old annotations - regurgitated_soup=",
                        regurgitated_soup)
        else:
            regurgitated_soup = BeautifulSoup()

        # Find new annotations
        uas = new_soup.findAll('div', 'annotation')
        new_hashes = set([ua['hash'] for ua in uas])
        debug_print("new_hashes=", sorted(new_hashes))
        debug_print("old hashes=", sorted(old_hashes))
        debug_print("new_hashes.difference(old_hashes)=",
                    new_hashes.difference(old_hashes))

        updates = list(new_hashes.difference(old_hashes))
        debug_print("differences between old and new hashs - updates=",
                    updates)
        if ouas is not None:
            if len(updates):
                debug_print("have updates and ouas")
                # Append new to regurgitated
                dtc = len(regurgitated_soup.div)
                debug_print("length regurgitated_soup - dtc=", dtc)
                for new_annotation_id in updates:
                    debug_print(
                        "extending regurgitated_soup - new_annotation_id=",
                        new_annotation_id)
                    new_annotation = new_soup.find('div',
                                                   {'hash': new_annotation_id})
                    regurgitated_soup.div.insert(dtc, new_annotation)
                    dtc += 1
            merged_soup = unicode(sort_merged_annotations(regurgitated_soup))
        else:
            debug_print("have updates and ouas")
            if regurgitated_soup.contents:
                debug_print("adding old_soup and new_soup")
                debug_print("unicode(regurgitated_soup)=",
                            unicode(regurgitated_soup))
                debug_print("unicode(new_soup)=", unicode(new_soup))
                merged_soup = unicode(regurgitated_soup) + unicode(new_soup)
            else:
                debug_print("just new_soup")
                merged_soup = unicode(new_soup)
        debug_print("merged_soup=", merged_soup)
        return merged_soup

    elif merge_index == 'timestamp':
        timestamps = {}
        # Get the timestamps and hashes of the stored annotations
        suas = old_soup.findAll('div', 'annotation')
        for sua in suas:
            try:
                timestamp = sua.find('td', 'timestamp')['uts']
                timestamps[timestamp] = {'stored_hash': sua['hash']}
            except:
                continue

        # Rerender stored annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            annotation_list = parent.opts.db.capture_content(
                ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(
                parent.opts.db.rerender_to_html_from_list(annotation_list))
        else:
            # Keep regurgitated_soup bound even when there are no stored
            # annotations, so the hash lookups below cannot raise NameError
            regurgitated_soup = BeautifulSoup()

        # Add device annotation timestamps and hashes
        duas = new_soup.findAll('div', 'annotation')
        for dua in duas:
            try:
                timestamp = dua.find('td', 'timestamp')['uts']
                if timestamp in timestamps:
                    timestamps[timestamp]['device_hash'] = dua['hash']
                else:
                    timestamps[timestamp] = {'device_hash': dua['hash']}
            except:
                print("ERROR: malformed timestamp in device annotation")
                print(dua.prettify())

        merged_soup = BeautifulSoup(ANNOTATIONS_HEADER)

        for ts in sorted(timestamps):
            if 'stored_hash' in timestamps[ts] and 'device_hash' not in timestamps[ts]:
                # Stored only - add from regurgitated_soup
                annotation = regurgitated_soup.find(
                    'div', {'hash': timestamps[ts]['stored_hash']})

            elif 'stored_hash' not in timestamps[ts] and 'device_hash' in timestamps[ts]:
                # Device only - add from new_soup
                annotation = new_soup.find(
                    'div', {'hash': timestamps[ts]['device_hash']})

            elif timestamps[ts]['stored_hash'] == timestamps[ts][
                    'device_hash']:
                # Stored matches device - add from regurgitated_soup, as user may have modified
                annotation = regurgitated_soup.find(
                    'div', {'hash': timestamps[ts]['stored_hash']})

            elif timestamps[ts]['stored_hash'] != timestamps[ts]['device_hash']:
                # Device has been updated since initial capture - add from new_soup
                annotation = new_soup.find(
                    'div', {'hash': timestamps[ts]['device_hash']})

            else:
                continue

            merged_soup.div.append(annotation)

        return unicode(sort_merged_annotations(merged_soup))
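
The hash branch above reduces to a set difference. A self-contained toy run of
that idea (hash values are illustrative):

    old_hashes = set(['a1', 'b2'])        # already stored in the library
    new_hashes = set(['b2', 'c3', 'd4'])  # arriving from the device
    updates = new_hashes.difference(old_hashes)
    # Only the annotations hashed 'c3' and 'd4' are appended to the
    # regurgitated soup; 'b2' is skipped as a duplicate
    assert updates == set(['c3', 'd4'])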
Example #60
    def generate_html(comments):
        args = dict(xmlns=XHTML_NS,
                    title_str=title_str,
                    css=css,
                    title=title,
                    author=author,
                    publisher=publisher,
                    pubdate_label=_('Published'), pubdate=pubdate,
                    series_label=_('Series'), series=series,
                    rating_label=_('Rating'), rating=rating,
                    tags_label=_('Tags'), tags=tags,
                    comments=comments,
                    footer='',
                    searchable_tags=' '.join(escape(t)+'ttt' for t in tags.tags_list),
                    )
        for key in mi.custom_field_keys():
            m = mi.get_user_metadata(key, False) or {}
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                dkey = key.replace('#', '_')
                dt = m.get('datatype')
                if dt == 'series':
                    args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
                elif dt == 'rating':
                    args[dkey] = rating_to_stars(mi.get(key), m.get('display', {}).get('allow_half_stars', False))
                elif dt == 'comments':
                    val = val or ''
                    display = m.get('display', {})
                    ctype = display.get('interpret_as') or 'html'
                    if ctype == 'long-text':
                        val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(val)
                    elif ctype == 'short-text':
                        val = '<span>%s</span>' % escape(val)
                    elif ctype == 'markdown':
                        val = markdown(val)
                    else:
                        val = comments_to_html(val)
                    args[dkey] = val
                else:
                    args[dkey] = escape(val)
                args[dkey+'_label'] = escape(display_name)
            except Exception:
                # if the val (custom column contents) is None, don't add to args
                pass

        if False:
            print("Custom column values available in jacket template:")
            for key in args.keys():
                if key.startswith('_') and not key.endswith('_label'):
                    print(" %s: %s" % ('#' + key[1:], args[key]))

        # Used in the comment describing use of custom columns in templates
        # Don't change this unless you also change it in template.xhtml
        args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
        args['_genre'] = args.get('_genre', '{_genre}')

        formatter = SafeFormatter()
        generated_html = formatter.format(template, **args)

        # Post-process the generated html to strip out empty header items

        soup = BeautifulSoup(generated_html)
        if not series:
            series_tag = soup.find(attrs={'class':'cbj_series'})
            if series_tag is not None:
                series_tag.extract()
        if not rating:
            rating_tag = soup.find(attrs={'class':'cbj_rating'})
            if rating_tag is not None:
                rating_tag.extract()
        if not tags:
            tags_tag = soup.find(attrs={'class':'cbj_tags'})
            if tags_tag is not None:
                tags_tag.extract()
        if not pubdate:
            pubdate_tag = soup.find(attrs={'class':'cbj_pubdata'})
            if pubdate_tag is not None:
                pubdate_tag.extract()
        if output_profile.short_name != 'kindle':
            hr_tag = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'})
            if hr_tag is not None:
                hr_tag.extract()

        return strip_encoding_declarations(
                soup.renderContents('utf-8').decode('utf-8'))