def search_for_asin_on_amazon(self, query):
    '''Look up a book's ASIN by running *query* through the Amazon search page.

    Returns the first group captured by AMAZON_ASIN_PAT inside a
    ``resultsCol`` container that offers a 1-Click purchase. Returns None
    when the page does not exist, the search reports no usable matches, or
    no container yields an ASIN.
    '''
    encoded = urlencode({'keywords': query})
    # encoded reads "keywords=<escaped query>"; [9:] drops the "keywords="
    # prefix so the escaped text can be appended to the rh filter as well.
    url = ('/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A'
           + encoded[9:] + '&' + encoded)

    try:
        response = open_url(self._connections['amazon'], url)
    except PageDoesNotExist:
        return None

    # A "no match" page is only final when it offers neither a spelling
    # suggestion nor fallback results from other departments.
    nothing_matched = 'did not match any products' in response
    has_alternative = ('Did you mean:' in response
                       or 'so we searched in All Departments' in response)
    if nothing_matched and not has_alternative:
        return None

    containers = BeautifulSoup(response).findAll('div', {'id': 'resultsCol'})
    for container in containers:
        markup = str(container)
        if 'Buy now with 1-Click' not in markup:
            continue
        match = AMAZON_ASIN_PAT.search(markup)
        if match:
            return match.group(1)
    return None
def download_html(target_url):
    """Fetch *target_url* and parse it, never raising on fetch/parse errors.

    Returns a tuple ``(html_page, soup, html_raw)``:
      * html_page -- ``soup.prettify()`` output,
      * soup      -- the BeautifulSoup object,
      * html_raw  -- the body exactly as read from the connection.
    Any element that could not be produced is returned as ``""``.
    """
    soup = ""
    html_page = ""
    html_raw = ""
    connection = None
    try:
        connection = myurlopen(target_url)
        html_raw = connection.read()
        soup = BeautifulSoup(html_raw)
        if DEBUG:
            if hasattr(soup, "findAll"):
                print("BeautifulSoup3 is being used in download_html")
            elif hasattr(soup, "find_all"):
                print("BeautifulSoup4 is being used in download_html")
            else:
                print("BeautifulSoup???? is being used in download_html")
        html_page = soup.prettify()  # take note...
    except Exception as e:
        # Deliberate best effort: callers get empty strings on failure.
        if DEBUG:
            print(unicode_type(e))
    finally:
        # Close the connection even when read() or parsing raised; the
        # original only closed it on the success path.
        if connection is not None:
            try:
                connection.close()
            except Exception as e:
                if DEBUG:
                    print(unicode_type(e))

    # Normalise every falsy result to the empty string.
    if not soup:
        if DEBUG:
            print("not soup")
        soup = ""
    if not html_page:
        html_page = ""
    if not html_raw:
        html_raw = ""
    return html_page, soup, html_raw
def get_annotations_date_range(self):
    '''
    Update self.oldest_annotation / self.newest_annotation from the books
    listed in self.annotation_map.

    The two attributes are expected to start out reversed so that the
    comparisons below can only tighten the range. When no annotation is
    found at all, the two values are swapped back.
    '''
    found_any = False
    for book_id in self.annotation_map:
        metadata = self.cdb.get_metadata(book_id, index_is_id=True)
        if self.field == 'Comments':
            markup = metadata.comments
        else:
            markup = metadata.get_user_metadata(self.field, False)['#value#']

        for annotation in BeautifulSoup(markup).findAll('div', 'annotation'):
            found_any = True
            # The timestamp is read from the 'uts' attribute of the cell.
            stamp = float(annotation.find('td', 'timestamp')['uts'])
            if stamp < self.oldest_annotation:
                self.oldest_annotation = stamp
            if stamp > self.newest_annotation:
                self.newest_annotation = stamp

    if not found_any:
        self.oldest_annotation, self.newest_annotation = (
            self.newest_annotation, self.oldest_annotation)
def existing_annotations(parent, field, return_all=False):
    '''
    Return the ids of books whose *field* contains a ``user_annotations`` div.

    parent:     object exposing ``opts.gui.current_db``
    field:      'Comments' or the key of a custom column; a falsy value
                yields an empty list without touching the database
    return_all: when False the scan stops at the first annotated book, so
                the result holds at most one id (an existence check); when
                True every book is scanned and the total is logged
    '''
    annotation_map = []
    if not field:
        return annotation_map

    db = parent.opts.gui.current_db
    id_index = db.FIELD_MAP['id']
    for record in db.data.iterall():
        mi = db.get_metadata(record[id_index], index_is_id=True)
        if field == 'Comments':
            if not mi.comments:
                continue
            markup = mi.comments
        else:
            markup = mi.get_user_metadata(field, False)['#value#']

        if BeautifulSoup(markup).find('div', 'user_annotations') is not None:
            annotation_map.append(mi.id)
            if not return_all:
                break

    if return_all:
        _log_location("Identified %d annotated books of %d total books" %
                      (len(annotation_map), len(db.data)))
    return annotation_map
def extract_calibre_cover(raw, base, log):
    """Return the bytes of the cover image referenced by a cover page.

    raw:  markup of the candidate cover page
    base: directory against which the image ``src`` is resolved
    log:  accepted for interface compatibility; not used here

    Two layouts are recognised, both requiring that the page contains no
    heading/paragraph/span/font/br tags: a page with exactly one ``<img>``
    whose ``alt`` is "cover", or a ``<body>`` with no text and exactly one
    ``<img src=...>``. Returns None when neither applies or the image file
    does not exist.
    """
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    def read_image(src):
        # Resolve a '/'-separated src against base and read it, closing
        # the file deterministically.
        path = os.path.join(base, *src.split("/"))
        if os.path.exists(path):
            with open(path, "rb") as f:
                return f.read()
        return None

    soup = BeautifulSoup(raw)
    matches = soup.find(name=["h1", "h2", "h3", "h4", "h5", "h6", "p", "span",
                              "font", "br"])
    if matches is not None:
        return None

    images = soup.findAll("img")
    if len(images) == 1 and images[0].get("alt", "") == "cover":
        data = read_image(images[0]["src"])
        if data is not None:
            return data

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    body = soup.find("body")
    if body is None:
        return None
    text = u"".join(map(unicode, body.findAll(text=True)))
    if text.strip():
        # Body has text, abort
        return None
    images = body.findAll("img", src=True)
    if len(images) == 1:
        return read_image(images[0]["src"])
    return None
def get_annotations_date_range(self):
    '''
    Narrow self.oldest_annotation / self.newest_annotation to the range of
    timestamps found in the books of self.annotation_map.

    Both attributes are expected to be initialised reversed so the first
    comparison always wins; if nothing is found they are swapped back.
    '''
    saw_annotation = False
    for cid in self.annotation_map:
        mi = self.cdb.get_metadata(cid, index_is_id=True)
        source = (mi.comments if self.field == 'Comments'
                  else mi.get_user_metadata(self.field, False)['#value#'])

        for entry in BeautifulSoup(source).findAll('div', 'annotation'):
            saw_annotation = True
            # The timestamp is read from the 'uts' attribute of the cell.
            when = float(entry.find('td', 'timestamp')['uts'])
            if when < self.oldest_annotation:
                self.oldest_annotation = when
            if when > self.newest_annotation:
                self.newest_annotation = when

    if not saw_annotation:
        # Undo the reversed initial values.
        self.newest_annotation, self.oldest_annotation = (
            self.oldest_annotation, self.newest_annotation)
def save_soup(soup, target):
    '''Write *soup* to the file *target* as UTF-8.

    Before writing, every <meta> whose content mentions a charset is
    replaced by a UTF-8 Content-Type <meta> (or one is inserted at the top
    of <head> when none was replaced), and absolute paths to existing files
    in img/link/a ``src``/``href`` attributes are rewritten relative to the
    directory of *target*.
    '''
    charset_meta = BeautifulSoup(
        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
    ).find('meta')

    replaced = False
    for meta in soup.findAll('meta', content=True):
        if 'charset' in meta.get('content', '').lower():
            meta.replaceWith(charset_meta)
            replaced = True
    if not replaced:
        head = soup.find('head')
        if head is not None:
            head.insert(0, charset_meta)

    target_dir = os.path.dirname(target)
    for tag in soup.findAll(['img', 'link', 'a']):
        for attr in ('src', 'href'):
            location = tag.get(attr, None)
            if not location:
                continue
            if (os.path.isfile(location) and os.path.exists(location)
                    and os.path.isabs(location)):
                relative = relpath(location, target_dir).replace(os.sep, '/')
                tag[attr] = unicode_path(relative)

    # Serialise before opening so a failure does not truncate the target.
    payload = unicode(soup).encode('utf-8')
    with open(target, 'wb') as out:
        out.write(payload)
def existing_annotations(parent, field, return_all=False):
    """
    Return the ids of books whose *field* holds a ``user_annotations`` div.

    With return_all False the scan stops at the first hit, so the list has
    at most one id; with return_all True every book is scanned and the
    result is logged. A falsy *field* logs "no active field" and returns
    an empty list.
    """
    _log_location(field)
    annotated_ids = []
    if not field:
        _log("no active field")
        return annotated_ids

    db = parent.opts.gui.current_db
    id_index = db.FIELD_MAP["id"]
    for record in db.data.iterall():
        mi = db.get_metadata(record[id_index], index_is_id=True)
        if field == "Comments":
            if not mi.comments:
                continue
            markup = mi.comments
        else:
            markup = mi.get_user_metadata(field, False)["#value#"]

        if BeautifulSoup(markup).find("div", "user_annotations") is None:
            continue
        annotated_ids.append(mi.id)
        if not return_all:
            break

    if return_all:
        _log("Identified %d annotated books of %d total books" %
             (len(annotated_ids), len(db.data)))
        _log("annotation_map: %s" % repr(annotated_ids))
    return annotated_ids
def save_soup(soup, target):
    '''Serialise *soup* into the file *target*, encoded as UTF-8.

    A UTF-8 Content-Type <meta> replaces every existing <meta> whose
    content mentions a charset; when none is replaced it is inserted first
    in <head> (if there is one). ``src``/``href`` values on img/link/a tags
    that are absolute paths to existing files are rewritten relative to the
    directory containing *target*.
    '''
    utf8_meta = BeautifulSoup(
        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
    ).find('meta')

    swapped = False
    for candidate in soup.findAll('meta', content=True):
        if 'charset' not in candidate.get('content', '').lower():
            continue
        candidate.replaceWith(utf8_meta)
        swapped = True
    if not swapped:
        head = soup.find('head')
        if head is not None:
            head.insert(0, utf8_meta)

    base_dir = os.path.dirname(target)
    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            value = tag.get(key, None)
            points_to_local_file = (
                value and os.path.isfile(value) and os.path.exists(value)
                and os.path.isabs(value))
            if points_to_local_file:
                tag[key] = unicode_path(
                    relpath(value, base_dir).replace(os.sep, '/'))

    # Serialise before opening so a failure does not truncate the target.
    encoded = unicode_type(soup).encode('utf-8')
    with open(target, 'wb') as handle:
        handle.write(encoded)
def find_all_annotated_books(self):
    '''
    Append to self.annotation_map the id of every library book whose
    self.field content contains a ``user_annotations`` div.

    Nothing is scanned (and a message is logged) when self.field is empty
    or is neither 'Comments' nor one of the library's custom field keys.
    '''
    field_unusable = (
        not self.field
        or (self.field not in self.cdb.custom_field_keys()
            and self.field != 'Comments'))
    if field_unusable:
        self._log_location()
        self._log("No custom column field specified, cannot find annotated books")
        return

    id_index = self.cdb.FIELD_MAP['id']
    for record in self.cdb.data.iterall():
        mi = self.cdb.get_metadata(record[id_index], index_is_id=True)
        if self.field == 'Comments':
            if not mi.comments:
                continue
            markup = mi.comments
        else:
            markup = mi.get_user_metadata(self.field, False)['#value#']

        if BeautifulSoup(markup).find('div', 'user_annotations') is not None:
            self.annotation_map.append(mi.id)
def existing_annotations(parent, field, return_all=False):
    '''
    List the ids of books whose *field* contains a ``user_annotations`` div.

    return_all=False: stop at the first annotated book (existence check,
    at most one id returned). return_all=True: scan every book and log the
    outcome. A falsy *field* logs "no active field" and returns [].
    '''
    _log_location(field)
    found = []
    if field:
        db = parent.opts.gui.current_db
        id_column = db.FIELD_MAP['id']
        for row in db.data.iterall():
            mi = db.get_metadata(row[id_column], index_is_id=True)
            if field != 'Comments':
                content = mi.get_user_metadata(field, False)['#value#']
            elif mi.comments:
                content = mi.comments
            else:
                # Book has no comments at all, nothing to inspect.
                continue

            if BeautifulSoup(content).find('div', 'user_annotations') is not None:
                found.append(mi.id)
                if not return_all:
                    break

        if return_all:
            _log("Identified %d annotated books of %d total books" %
                 (len(found), len(db.data)))
            _log("annotation_map: %s" % repr(found))
    else:
        _log("no active field")
    return found
def extract_calibre_cover(raw, base, log):
    '''Return the bytes of the cover image referenced by a cover page.

    raw:  markup of the candidate cover page
    base: directory against which the image ``src`` is resolved
    log:  accepted for interface compatibility; not used here

    The page must contain no heading/paragraph/span/font/br tags. It is
    then accepted either when it has exactly one ``<img>`` with
    ``alt="cover"``, or when its ``<body>`` has no text and exactly one
    ``<img src=...>``. Returns None when no cover is found or the image
    file does not exist.
    '''
    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    def read_image(src):
        # Resolve a '/'-separated src against base and read it, closing
        # the file deterministically.
        path = os.path.join(base, *src.split('/'))
        if os.path.exists(path):
            with open(path, 'rb') as f:
                return f.read()
        return None

    soup = BeautifulSoup(raw)
    matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
                              'font', 'br'])
    if matches is not None:
        return None

    images = soup.findAll('img')
    if len(images) == 1 and images[0].get('alt', '') == 'cover':
        data = read_image(images[0]['src'])
        if data is not None:
            return data

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    body = soup.find('body')
    if body is None:
        return None
    text = u''.join(map(unicode, body.findAll(text=True)))
    if text.strip():
        # Body has text, abort
        return None
    images = body.findAll('img', src=True)
    if len(images) == 1:
        return read_image(images[0]['src'])
    return None
def search_for_asin_on_amazon(self, query):
    '''Search Amazon with *query* and return the book's ASIN, if any.

    The ASIN is the first group captured by AMAZON_ASIN_PAT in an
    ``s-result-list`` container that offers a 1-Click purchase. None is
    returned when the page does not exist, the search reports no usable
    matches, or no container yields an ASIN.
    '''
    keywords = urlencode({'keywords': query})
    # keywords reads "keywords=<escaped query>"; [9:] drops the "keywords="
    # prefix so the escaped text can be appended to the rh filter as well.
    url = ('/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A'
           + keywords[9:] + '&' + keywords)

    try:
        page = open_url(self._connections['amazon'], url)
    except PageDoesNotExist:
        return None

    # check to make sure there are results
    if 'did not match any products' in page:
        suggestion_offered = 'Did you mean:' in page
        searched_elsewhere = 'so we searched in All Departments' in page
        if not suggestion_offered and not searched_elsewhere:
            return None

    result_lists = BeautifulSoup(page).findAll('div', {'class': 's-result-list'})
    if not result_lists:
        return None

    for result_list in result_lists:
        text = str(result_list)
        if 'Buy now with 1-Click' in text:
            found = AMAZON_ASIN_PAT.search(text)
            if found:
                return found.group(1)
    return None
def get_series(title, authors, timeout=60):
    '''Look up series information for a book on the KDL server.

    title:   book title; a leading character listed in _ignore_starts and
             a leading article (A/The/An) are dropped before searching
    authors: sequence of author names; only the first is used, reduced to
             the text before a comma or otherwise to its last word
    timeout: seconds allowed for the HTTP request

    Returns a Metadata object built from the original title/authors, with
    ``series`` (and ``series_index`` when it parses as an integer) filled
    in if the server reports them. Raises Exception with a "server busy"
    message when the request times out; other URLErrors propagate.
    '''
    mi = Metadata(title, authors)
    # Nothing to search for without both a title and an author.
    if not title or not authors:
        return mi

    if title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode):
        title = title.encode('utf-8')
    title = urllib.quote_plus(title)

    author = authors[0].strip()
    if not author:
        return mi
    if ',' in author:
        author = author.split(',')[0]
    else:
        author = author.split()[-1]

    url = URL.format(author, title)
    br = browser()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi

    raw = xml_to_unicode(raw)[0]
    soup = BeautifulSoup(raw)
    searcharea = soup.find('div', attrs={'class': 'searcharea'})
    if searcharea is None:
        return mi
    ss = searcharea.find('div', attrs={'class': 'seriessearch'})
    if ss is None:
        return mi
    a = ss.find('a', href=True)
    if a is None:
        return mi

    # The series name is carried in the link's query string.
    href = a['href'].partition('?')[-1]
    data = urlparse.parse_qs(href)
    series = data.get('SeriesName', [])
    if not series:
        return mi
    series = re.sub(r' series$', '', series[0]).strip()
    if series:
        mi.series = series
        # The index is read from the element following the series block.
        # That sibling may be missing or a bare text node with no contents.
        contents = getattr(ss.nextSibling, 'contents', None)
        if contents:
            raw = unicode(contents[0]).partition('.')[0].strip()
            try:
                mi.series_index = int(raw)
            except ValueError:
                # Not a number: leave series_index unset.
                pass
    return mi
def get_series(title, authors, timeout=60):
    '''Query the KDL server for the series a book belongs to.

    title:   book title; a leading character listed in _ignore_starts and
             a leading article (A/The/An) are dropped before searching
    authors: sequence of author names; only the first is used, reduced to
             the text before a comma or otherwise to its last word
    timeout: seconds allowed for the HTTP request

    Returns a Metadata object for title/authors; ``series`` and, when it
    parses as an integer, ``series_index`` are set if the server reports
    them. A request timeout is re-raised as Exception with a "server busy"
    message; any other URLError propagates unchanged.
    '''
    mi = Metadata(title, authors)
    # Nothing to search for without both a title and an author.
    if not title or not authors:
        return mi

    if title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode):
        title = title.encode('utf-8')
    title = urllib.quote_plus(title)

    author = authors[0].strip()
    if not author:
        return mi
    if ',' in author:
        author = author.split(',')[0]
    else:
        author = author.split()[-1]

    url = URL.format(author, title)
    br = browser()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi

    raw = xml_to_unicode(raw)[0]
    soup = BeautifulSoup(raw)
    searcharea = soup.find('div', attrs={'class':'searcharea'})
    if searcharea is None:
        return mi
    ss = searcharea.find('div', attrs={'class':'seriessearch'})
    if ss is None:
        return mi
    a = ss.find('a', href=True)
    if a is None:
        return mi

    # The series name is carried in the link's query string.
    href = a['href'].partition('?')[-1]
    data = urlparse.parse_qs(href)
    series = data.get('SeriesName', [])
    if not series:
        return mi
    series = re.sub(r' series$', '', series[0]).strip()
    if series:
        mi.series = series
        # The index is read from the element following the series block.
        # That sibling may be missing or a bare text node with no contents.
        contents = getattr(ss.nextSibling, 'contents', None)
        if contents:
            raw = unicode(contents[0]).partition('.')[0].strip()
            try:
                mi.series_index = int(raw)
            except ValueError:
                # Not a number: leave series_index unset.
                pass
    return mi
def get_soup(self, src, url=None):
    '''
    Parse src into a soup and prune it according to this object's filters.

    src is decoded with xml_to_unicode, passed through
    self.preprocess_raw_html(usrc, url) and parsed with a markup-massage
    list made of BeautifulSoup's defaults, self.preprocess_regexps, a
    DOCTYPE stripper and a comment stripper. The tree is then reduced using
    self.keep_only_tags, self.remove_tags_after, self.remove_tags_before
    and self.remove_tags, and the value of self.preprocess_html_ext(soup)
    is returned.

    Side effect: a dict-valued self.keep_only_tags is replaced by a
    one-element list holding that dict.
    '''
    nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
    nmassage.extend(self.preprocess_regexps)
    # Some websites have buggy doctype declarations that mess up beautifulsoup
    nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL | re.IGNORECASE), lambda m: '')]
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    soup = BeautifulSoup(usrc, markupMassage=nmassage)

    # The hook may hand back replacement markup, which is then parsed
    # in place of the original document.
    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

    if self.keep_only_tags:
        # Build a fresh <body> holding only the matching tags, in match
        # order, and swap it in for the original body.
        body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        # Walk from tag up through its ancestors (stopping at <body>) and,
        # at each level, extract every sibling reachable through the
        # attribute named by next ('nextSibling' or 'previousSibling').
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                # ns is read before the extract, so on the first pass it is
                # the same node as after; the loop ends once tag has no
                # sibling left on that side.
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(
            self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(
            self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    # Each entry of remove_tags is a dict of findAll() keyword arguments.
    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()
    return self.preprocess_html_ext(soup)
def find_all_annotated_books(self):
    '''
    Append to self.annotation_map the id of every book returned by the
    'formats:EPUB' search whose self.field value contains a
    ``user_annotations`` div. Books with no value in that field are skipped.
    '''
    self._log_location("field: {0}".format(self.field))
    for book_id in self.cdb.search_getting_ids('formats:EPUB', ''):
        mi = self.cdb.get_metadata(book_id, index_is_id=True)
        value = mi.get_user_metadata(self.field, False)['#value#']
        if value is None:
            continue
        if BeautifulSoup(value).find('div', 'user_annotations') is not None:
            self.annotation_map.append(mi.id)
def get_soup(self, src, url=None):
    '''
    Parse src into a soup and prune it according to this object's filters.

    src is decoded with xml_to_unicode and passed through
    self.preprocess_raw_html(usrc, url). The substitutions in
    self.preprocess_regexps plus a comment stripper are applied to the
    text before it is parsed. The tree is then reduced using
    self.keep_only_tags, self.remove_tags_after, self.remove_tags_before
    and self.remove_tags, and the value of self.preprocess_html_ext(soup)
    is returned.

    Side effect: a dict-valued self.keep_only_tags is replaced by a
    one-element list holding that dict.
    '''
    nmassage = []
    nmassage.extend(self.preprocess_regexps)
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    # Apply the (pattern, replacement) pairs to the raw text before parsing.
    for pat, repl in nmassage:
        usrc = pat.sub(repl, usrc)
    soup = BeautifulSoup(usrc)

    # The hook may hand back replacement markup, which gets the same
    # substitutions and is then parsed in place of the original document.
    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
        for pat, repl in nmassage:
            replace = pat.sub(repl, replace)
        soup = BeautifulSoup(replace)

    if self.keep_only_tags:
        # Build a fresh <body> holding only the matching tags, in match
        # order, and swap it in for the original body.
        body = soup.new_tag('body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        # Walk from tag up through its ancestors (stopping at <body>) and,
        # at each level, extract every sibling reachable through the
        # attribute named by next ('nextSibling' or 'previousSibling').
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                # ns is read before the extract, so on the first pass it is
                # the same node as after; the loop ends once tag has no
                # sibling left on that side.
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    # Each entry of remove_tags is a dict of findAll() keyword arguments.
    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()
    return self.preprocess_html_ext(soup)
def get_asin(self, connection):
    '''Find the book's ASIN through an Amazon search and store it.

    connection: HTTP connection to Amazon; if the first request fails it
                is closed and replaced by a new one (tunnelled through the
                configured proxy when self._proxy is set)

    On success self._asin is set, the ASIN is saved as the 'mobi-asin'
    identifier of the book and in self._book_settings.prefs['asin'], and
    the (possibly replaced) connection is returned.

    On failure self._status / self._status_message are set and Exception
    is raised with that message.
    '''
    def fail(message):
        # Record the failure on the instance, then abort.
        self._status = self.FAIL
        self._status_message = message
        raise Exception(self._status_message)

    query = urlencode({'keywords': '%s - %s' % (self._title, self._author)})
    # query reads "keywords=<escaped>"; [9:] drops the "keywords=" prefix.
    path = ('/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A'
            + query[9:] + '&' + query)

    try:
        connection.request('GET', path, headers=self.HEADERS)
        response = connection.getresponse().read()
    except Exception:
        # The connection may have gone stale: reconnect once and retry.
        try:
            connection.close()
            if self._proxy:
                connection = HTTPConnection(self._http_address, self._http_port)
                connection.set_tunnel('www.amazon.com', 80)
            else:
                connection = HTTPConnection('www.amazon.com')
            connection.request('GET', path, headers=self.HEADERS)
            response = connection.getresponse().read()
        except Exception:
            fail(self.FAILED_COULD_NOT_CONNECT_TO_AMAZON)

    # check to make sure there are results
    if ('did not match any products' in response
            and 'Did you mean:' not in response
            and 'so we searched in All Departments' not in response):
        fail(self.FAILED_COULD_NOT_FIND_AMAZON_PAGE)

    soup = BeautifulSoup(response)
    results = soup.findAll('div', {'id': 'resultsCol'})
    if not results:
        fail(self.FAILED_COULD_NOT_FIND_AMAZON_PAGE)

    for result in results:
        markup = str(result)
        if 'Buy now with 1-Click' not in markup:
            continue
        asin_search = self.AMAZON_ASIN_PAT.search(markup)
        if asin_search:
            self._asin = asin_search.group(1)
            mi = self._db.get_metadata(self._book_id)
            identifiers = mi.get_identifiers()
            identifiers['mobi-asin'] = self._asin
            mi.set_identifiers(identifiers)
            self._db.set_metadata(self._book_id, mi)
            self._book_settings.prefs['asin'] = self._asin
            return connection

    fail(self.FAILED_COULD_NOT_FIND_AMAZON_ASIN)
def generate_html(comments):
    '''Render the jacket template with *comments* and return it as text.

    All other template values (title, author, series, mi, css, ...) are
    taken from the enclosing scope. Custom fields are exposed to the
    template as ``_<name>`` / ``_<name>_label``. After rendering, the
    series, rating, tags and publication-date blocks are removed when
    their value is empty, and the kindle banner rule is removed unless the
    output profile is 'kindle'.
    '''
    args = dict(xmlns=XHTML_NS,
                title_str=title_str,
                css=css,
                title=title,
                author=author,
                publisher=publisher,
                pubdate_label=_('Published'),
                pubdate=pubdate,
                series_label=_('Series'),
                series=series,
                rating_label=_('Rating'),
                rating=rating,
                tags_label=_('Tags'),
                tags=tags,
                comments=comments,
                footer='')
    for key in mi.custom_field_keys():
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            key = key.replace('#', '_')
            args[key] = escape(val)
            args[key + '_label'] = escape(display_name)
        except Exception:
            # Best effort: a custom field that cannot be formatted is
            # left out of the template arguments.
            continue

    # Used in the comment describing use of custom columns in templates
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    generated_html = P('jacket/template.xhtml',
                       data=True).decode('utf-8').format(**args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    # (remove?, tag name or None for any tag, CSS class)
    removable = (
        (not series, None, 'cbj_series'),
        (not rating, None, 'cbj_rating'),
        (not tags, None, 'cbj_tags'),
        (not pubdate, None, 'cbj_pubdata'),
        (output_profile.short_name != 'kindle', 'hr', 'cbj_kindle_banner_hr'),
    )
    for should_remove, tag_name, css_class in removable:
        if not should_remove:
            continue
        found = soup.find(tag_name, attrs={'class': css_class})
        if found is not None:
            found.extract()

    return strip_encoding_declarations(
        soup.renderContents('utf-8').decode('utf-8'))
def get_metadata_from_reader(rdr):
    '''Build a MetaInformation object from the home page of *rdr*.

    The title comes from ``rdr.title``, decoded with the encoding reported
    by ``rdr.GetEncoding()`` (falling back to cp1252 when that encoding is
    unavailable or unknown to ``codecs``). Authors, publisher, ISBN,
    comments and cover are extracted from the parsed home page by the
    module's ``_get_*`` helpers; optional values are only set when found.
    '''
    raw = rdr.GetFile(rdr.home)
    home = BeautifulSoup(
        xml_to_unicode(raw, strip_encoding_pats=True,
                       resolve_entities=True)[0])

    try:
        enc = rdr.GetEncoding()
        codecs.lookup(enc)
    except Exception:
        # Missing or unrecognised encoding: use the fallback.
        enc = 'cp1252'
    title = force_unicode(rdr.title, enc)

    authors = _get_authors(home)
    mi = MetaInformation(title, authors)

    publisher = _get_publisher(home)
    if publisher:
        mi.publisher = publisher
    isbn = _get_isbn(home)
    if isbn:
        mi.isbn = isbn
    comments = _get_comments(home)
    if comments:
        mi.comments = comments
    cdata = _get_cover(home, rdr)
    if cdata is not None:
        mi.cover_data = ('jpg', cdata)
    return mi
def construct(self, bookmark_notes):
    '''
    Render bookmark notes into a ``bookmark_notes`` div.

    bookmark_notes: {loc_sort: {color, location, note}…}
    Entries are emitted in sorted key order. When the
    'appearance_hr_checkbox' preference is set, a horizontal rule is
    inserted between consecutive entries. Returns the soup, or None when
    bookmark_notes is empty.
    '''
    if not bookmark_notes:
        return None

    soup = BeautifulSoup(
        '''<div class="{0}"></div>'''.format('bookmark_notes'))
    last_index = len(bookmark_notes) - 1
    position = 0
    for index, sort_key in enumerate(sorted(bookmark_notes.keys())):
        entry = bookmark_notes[sort_key]
        rendered = self.BOOKMARK_TEMPLATE.format(
            sort_key,
            entry['color'],
            self._get_style('Location'),
            entry['location'],
            self._get_style('Note'),
            entry['note'])
        soup.div.insert(position, rendered)
        position += 1

        wants_rule = (index < last_index
                      and plugin_prefs.get('appearance_hr_checkbox', False))
        if wants_rule:
            soup.div.insert(
                position,
                plugin_prefs.get('HORIZONTAL_RULE', '<hr width="80%" />'))
            position += 1
    return soup
def _remove_old_style(self, html):
    '''
    Parse html, drop the <style> element found in <head> and return the soup.

    Along the way one randomly chosen <td> of the 'vocabulary' div that
    contains a link is stored in self.td, its original href is stored in
    self.oh, and the href is replaced by the value of self._finalize().
    '''
    soup = BeautifulSoup(html)
    head = soup.find("head")
    vocabulary = soup.body.find('div', {'class': 'vocabulary'})
    linked_cells = vocabulary.findAll(lambda tag: tag.name == 'td' and tag.a)

    pick = random.randrange(len(linked_cells))
    self.td = linked_cells[pick]
    self.oh = self.td.a['href']
    # _finalize() is called only after self.td and self.oh are in place.
    self.td.a['href'] = self._finalize()

    stale_style = head.find('style')
    if stale_style:
        stale_style.extract()
    return soup
def _inject_css(self, html):
    '''
    Insert a <style type="text/css"> element holding the 'injected_css'
    preference as the first child of <head>.

    Returns the re-rendered document. The input is returned unchanged when
    no CSS is configured or when injection fails (for example when the
    document has no <head>).
    '''
    css = self.prefs.get('injected_css', None)
    if not css:
        return html

    try:
        styled_soup = BeautifulSoup(html)
        head = styled_soup.find("head")
        style_tag = Tag(styled_soup, 'style')
        style_tag['type'] = "text/css"
        style_tag.insert(0, css)
        head.insert(0, style_tag)
        return styled_soup.renderContents()
    except Exception:
        # Best effort: fall back to the original markup.
        return html
def _inject_css(self, html):
    '''
    Stick a <style type="text/css"> element built from the 'injected_css'
    preference at the top of the document's <head>.

    Returns the re-rendered document, or html unchanged when no CSS is
    configured or the injection fails (for example when there is no
    <head>).
    '''
    css = self.prefs.get('injected_css', None)
    if not css:
        return html

    try:
        styled_soup = BeautifulSoup(html)
        head = styled_soup.find("head")
        style_tag = Tag(styled_soup, 'style')
        style_tag['type'] = "text/css"
        style_tag.insert(0, css)
        head.insert(0, style_tag)
        return styled_soup.renderContents()
    except Exception:
        # Best effort: fall back to the original markup.
        return html
def get_soup(self, src, url=None):
    '''
    Parse src into a soup and prune it according to this object's filters.

    src is decoded with xml_to_unicode, passed through
    self.preprocess_raw_html(usrc, url) and parsed with a markup-massage
    list made of BeautifulSoup's defaults, self.preprocess_regexps, a
    DOCTYPE stripper and a comment stripper. The tree is then reduced using
    self.keep_only_tags, self.remove_tags_after, self.remove_tags_before
    and self.remove_tags, and the value of self.preprocess_html_ext(soup)
    is returned.

    Side effect: a dict-valued self.keep_only_tags is replaced by a
    one-element list holding that dict.
    '''
    nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
    nmassage.extend(self.preprocess_regexps)
    # Some websites have buggy doctype declarations that mess up beautifulsoup
    nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    soup = BeautifulSoup(usrc, markupMassage=nmassage)

    # The hook may hand back replacement markup, which is then parsed
    # in place of the original document.
    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0], markupMassage=nmassage)

    if self.keep_only_tags:
        # Build a fresh <body> holding only the matching tags, in match
        # order, and swap it in for the original body.
        body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        # Walk from tag up through its ancestors (stopping at <body>) and,
        # at each level, extract every sibling reachable through the
        # attribute named by next ('nextSibling' or 'previousSibling').
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                # ns is read before the extract, so on the first pass it is
                # the same node as after; the loop ends once tag has no
                # sibling left on that side.
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    # Each entry of remove_tags is a dict of findAll() keyword arguments.
    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()
    return self.preprocess_html_ext(soup)
def merge_annotations_with_comments(parent, cid, comments_soup, new_soup):
    '''
    Combine a book's comments with annotations and return the markup text.

    comments_soup: comments potentially with user_annotations; its old
                   comments_divider and user_annotations divs are removed
                   from it in place
    new_soup:      the annotations to add

    The result is the remaining comments, a fresh comments divider, then
    either new_soup alone or, when annotations already existed, the result
    of merge_annotations(parent, cid, old_soup, new_soup), with old_soup
    stripped of its <hr> elements.
    '''
    divider_text = plugin_prefs.get(
        'COMMENTS_DIVIDER',
        '· · • · ✦ · • · ·')
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        divider_text)

    # Drop the divider left by a previous merge.
    stale_divider = comments_soup.find('div', 'comments_divider')
    if stale_divider:
        stale_divider.extract()

    existing = comments_soup.find('div', 'user_annotations')
    if not existing:
        # No existing annotations: new_soup is appended as-is.
        return (unicode(comments_soup)
                + unicode(comments_divider)
                + unicode(new_soup))

    # Keep a separate copy of the existing annotations, without rules.
    old_soup = BeautifulSoup(unicode(existing))
    for rule in old_soup.findAll('hr'):
        rule.extract()

    # Take the existing annotations out of the comments before rendering.
    existing.extract()
    return (unicode(comments_soup)
            + unicode(comments_divider)
            + unicode(merge_annotations(parent, cid, old_soup, new_soup)))
def generate_html(comments):
    '''Fill the jacket template with *comments* and return the markup text.

    Every other template value (title, author, series, mi, css, ...) is
    read from the enclosing scope. Custom fields are made available to the
    template as ``_<name>`` / ``_<name>_label``. Once rendered, the series,
    rating, tags and publication-date blocks are dropped when their value
    is empty, and the kindle banner rule is dropped unless the output
    profile is 'kindle'.
    '''
    args = dict(xmlns=XHTML_NS,
                title_str=title_str,
                css=css,
                title=title,
                author=author,
                publisher=publisher,
                pubdate_label=_('Published'), pubdate=pubdate,
                series_label=_('Series'), series=series,
                rating_label=_('Rating'), rating=rating,
                tags_label=_('Tags'), tags=tags,
                comments=comments,
                footer=''
                )
    for key in mi.custom_field_keys():
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            key = key.replace('#', '_')
            args[key] = escape(val)
            args[key+'_label'] = escape(display_name)
        except Exception:
            # Best effort: a custom field that cannot be formatted is
            # left out of the template arguments.
            continue

    # Used in the comment describing use of custom columns in templates
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    generated_html = P('jacket/template.xhtml',
            data=True).decode('utf-8').format(**args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    # (remove?, tag name or None for any tag, CSS class)
    removable = (
        (not series, None, 'cbj_series'),
        (not rating, None, 'cbj_rating'),
        (not tags, None, 'cbj_tags'),
        (not pubdate, None, 'cbj_pubdata'),
        (output_profile.short_name != 'kindle', 'hr', 'cbj_kindle_banner_hr'),
    )
    for should_remove, tag_name, css_class in removable:
        if not should_remove:
            continue
        found = soup.find(tag_name, attrs={'class':css_class})
        if found is not None:
            found.extract()

    return strip_encoding_declarations(
            soup.renderContents('utf-8').decode('utf-8'))
def sort_merged_annotations(merged_soup):
    '''
    Input: a combined group of user annotations
    Output: a new soup, built from ANNOTATIONS_HEADER, holding the elements
    carrying a location_sort attribute in sorted location_sort order. When
    the 'appearance_hr_checkbox' preference is set, a horizontal rule is
    placed between consecutive annotations.
    '''
    include_hr = plugin_prefs.get('appearance_hr_checkbox', False)
    ordered_keys = sorted(
        tag['location_sort']
        for tag in merged_soup.findAll(location_sort=True))

    sorted_soup = BeautifulSoup(ANNOTATIONS_HEADER)
    last_index = len(ordered_keys) - 1
    slot = 0
    for index, key in enumerate(ordered_keys):
        annotation = merged_soup.find(attrs={'location_sort': key})
        sorted_soup.div.insert(slot, annotation)
        slot += 1
        if include_hr and index < last_index:
            rule = BeautifulSoup(
                plugin_prefs.get('HORIZONTAL_RULE', '<hr width="80%" />'))
            sorted_soup.div.insert(slot, rule)
            slot += 1
    return sorted_soup
def merge_annotations_with_comments(parent, cid, comments_soup, new_soup):
    '''
    Merge annotations into a book's comments and return the markup text.

    comments_soup: comments potentially with user_annotations; its old
                   comments_divider and user_annotations divs are removed
                   from it in place
    new_soup:      the annotations to add

    Output layout: remaining comments, a fresh comments divider, then the
    annotations. Those are new_soup when none existed before, otherwise the
    result of merge_annotations(parent, cid, old_soup, new_soup), where
    old_soup is the previous annotations without their <hr> elements.
    '''
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        plugin_prefs.get('COMMENTS_DIVIDER', '· · • · ✦ · • · ·'))

    # Remove the divider left by a previous merge.
    previous_divider = comments_soup.find('div', 'comments_divider')
    if previous_divider:
        previous_divider.extract()

    previous_annotations = comments_soup.find('div', 'user_annotations')
    if previous_annotations:
        # Copy the existing annotations and strip their rules.
        old_soup = BeautifulSoup(unicode(previous_annotations))
        for rule in old_soup.findAll('hr'):
            rule.extract()
        # Detach them from the comments before the comments are rendered.
        previous_annotations.extract()
        pieces = [
            unicode(comments_soup),
            unicode(comments_divider),
            unicode(merge_annotations(parent, cid, old_soup, new_soup)),
        ]
    else:
        # No existing, just merge comments_soup with already sorted new_soup
        pieces = [
            unicode(comments_soup),
            unicode(comments_divider),
            unicode(new_soup),
        ]
    return pieces[0] + pieces[1] + pieces[2]
def preview_css(self):
    '''
    Build a sample document (book notes, bookmark notes and three
    annotations, separated by divider divs), styled with
    css/annotations.css from the resources directory, and show it in
    self.parent.wv.
    Modeled after book_status:_get_formatted_annotations()
    '''
    from calibre_plugins.marvin_manager.annotations import (
        ANNOTATIONS_HTML_TEMPLATE, Annotation, Annotations,
        BookNotes, BookmarkNotes)

    def new_divider():
        return Tag(soup, 'div', [('class', "divider")])

    # Start from the shared template.
    soup = BeautifulSoup(ANNOTATIONS_HTML_TEMPLATE)

    # Replace the template's <style> with the CSS shipped in the resources.
    css_path = os.path.join(self.parent.opts.resources_path,
                            'css', 'annotations.css')
    with open(css_path, 'rb') as css_file:
        css = css_file.read().decode('utf-8')
    style_tag = Tag(soup, 'style')
    style_tag.insert(0, css)
    soup.head.style.replaceWith(style_tag)

    # Sample book notes, followed by a divider.
    book_notes_soup = BookNotes().construct(self.sample_book_notes)
    soup.body.append(book_notes_soup)
    soup.body.append(new_divider())

    # Sample bookmark notes, followed by a divider.
    bookmark_notes_soup = BookmarkNotes().construct(self.sample_bookmark_notes)
    soup.body.append(bookmark_notes_soup)
    soup.body.append(new_divider())

    # Sample annotations.
    preview = Annotations(None, title="Preview")
    for sample in (self.sample_ann_1, self.sample_ann_2, self.sample_ann_3):
        preview.annotations.append(Annotation(sample))
    soup.body.append(preview.to_HTML(preview.create_soup()))

    self.parent.wv.setHtml(unicode(soup.renderContents()))
def get_asin(self):
    '''Find the book's ASIN through an Amazon search.

    The search uses self.title_and_author over self._aConnection. If the
    first request fails, the connection is closed, re-created (tunnelled
    through the configured proxy when self._proxy is set) and the request
    is retried once.

    On success the ASIN is saved as the book's 'mobi-asin' identifier and
    returned. Returns None when Amazon cannot be reached, the search has
    no usable results, or no ASIN is found.
    '''
    query = urlencode({'keywords': '%s' % self.title_and_author})
    # query reads "keywords=<escaped>"; [9:] drops the "keywords=" prefix.
    path = ('/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A'
            + query[9:] + '&' + query)

    try:
        self._aConnection.request('GET', path, headers=self.HEADERS)
        response = self._aConnection.getresponse().read()
    except Exception:
        # The connection may have gone stale: reconnect once and retry.
        try:
            self._aConnection.close()
            if self._proxy:
                self._aConnection = HTTPConnection(self._http_address,
                                                   self._http_port)
                self._aConnection.set_tunnel('www.amazon.com', 80)
            else:
                self._aConnection = HTTPConnection('www.amazon.com')
            self._aConnection.request('GET', path, headers=self.HEADERS)
            response = self._aConnection.getresponse().read()
        except Exception:
            return None

    # check to make sure there are results
    if ('did not match any products' in response
            and 'Did you mean:' not in response
            and 'so we searched in All Departments' not in response):
        return None

    soup = BeautifulSoup(response)
    results = soup.findAll('div', {'id': 'resultsCol'})
    for result in results:
        markup = str(result)
        if 'Buy now with 1-Click' not in markup:
            continue
        asin_search = self.AMAZON_ASIN_PAT.search(markup)
        if asin_search:
            asin = asin_search.group(1)
            mi = self._db.get_metadata(self._book_id)
            identifiers = mi.get_identifiers()
            identifiers['mobi-asin'] = asin
            mi.set_identifiers(identifiers)
            self._db.set_metadata(self._book_id, mi)
            return asin
    return None
def read_html_toc(self, toc):
    """
    Populate this table of contents from the links of an HTML file.

    ``toc`` is the path of the HTML file; its directory becomes
    ``self.base_path``. Every ``<a>`` element that has an ``href`` attribute
    is turned into an item (path, fragment, link text) unless an item with the
    same path and fragment is already present in ``self.flat()``.
    """
    self.base_path = os.path.dirname(toc)
    # Read inside a with-block so the file handle is always closed
    with open(toc, "rb") as toc_file:
        raw = toc_file.read()
    soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.HTML_ENTITIES)
    for a in soup.findAll("a"):
        if not a.has_key("href"):
            continue
        purl = urlparse(unquote(a["href"]))
        href, fragment = purl[2], purl[5]
        # An empty fragment is stored as None, not as ''
        fragment = fragment.strip() if fragment else None
        href = href.strip()
        txt = "".join([unicode(s).strip() for s in a.findAll(text=True)])
        already_listed = any(
            i.href == href and i.fragment == fragment for i in self.flat())
        if not already_listed:
            self.add_item(href, fragment, txt)
def read_html_toc(self, toc):
    '''
    Populate this table of contents from the links of an HTML file.

    toc is the path of the HTML file; its directory becomes self.base_path.
    Every <a> element that has an href attribute is turned into an item
    (path, fragment, link text) unless an item with the same path and
    fragment is already present in self.flat().
    '''
    self.base_path = os.path.dirname(toc)
    # Read inside a with-block so the file handle is always closed
    with open(toc, 'rb') as toc_file:
        raw = toc_file.read()
    soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.HTML_ENTITIES)
    for a in soup.findAll('a'):
        if not a.has_key('href'):
            continue
        purl = urlparse(unquote(a['href']))
        href, fragment = purl[2], purl[5]
        # An empty fragment is stored as None, not as ''
        fragment = fragment.strip() if fragment else None
        href = href.strip()
        txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
        already_listed = any(
            i.href == href and i.fragment == fragment for i in self.flat())
        if not already_listed:
            self.add_item(href, fragment, txt)
def construct(self, book_notes):
    '''
    Render a list of notes as HTML.

    Returns None when book_notes is empty or None. Otherwise returns a soup
    holding one outer <div class="book_notes"> with, for every note, a
    <div class="book_note"> wrapping a <p class="book_note"> styled with
    self._get_note_style().
    '''
    if not book_notes:
        return None

    rendered = BeautifulSoup(
        '''<div class="{0}"></div>'''.format('book_notes'))
    for entry in book_notes:
        wrapper = Tag(rendered, 'div', [('class', "book_note")])
        paragraph = Tag(rendered, 'p',
                        [('class', "book_note"),
                         ('style', "{0}".format(self._get_note_style()))])
        paragraph.append(entry)
        wrapper.append(paragraph)
        rendered.div.append(wrapper)
    return rendered
def process_articles(self, title, article, baseurl, into_dir='links'): res = '' diskpath = os.path.join(self.current_dir, into_dir) ''' 必须添加Elemnt 的 Class ,否则框架会自动添加 ''' html = "<html><head><title>" + title + "</title><style type='text/css'>" + self.extra_css + "</style></head><body><div class='posts'>" for a in article: if self.show_progress: print '.', sys.stdout.flush() sys.stdout.flush() # self.log("article:"+str(a)) html += "<div class='post'><div class='post-frame'><img src='" + a['href'] + "' class='post-img'></img><span>"+a['tags']+"</span></div>" html += "</div>" html += "</div></body></html>" soup = BeautifulSoup(html) self.log.debug('Processing images...') try: self.process_images(soup, baseurl) except Exception: self.lof('Exception') finally: self.log('end processing images') _fname = title if not isinstance(_fname, unicode): _fname.decode('latin1', 'replace') _fname = _fname.encode('ascii', 'replace').replace('%', '').replace(os.sep, '') _fname = ascii_filename(_fname) _fname = os.path.splitext(_fname)[0] + '.xhtml' res = os.path.join(diskpath, _fname) self.downloaded_paths.append(res) nurl = baseurl + title self.filemap[nurl] = res save_soup(soup, res) self.downloaded_paths.append(res) return res
def generate_annotation_html(self, bookmark):
    '''
    Build the annotation markup for a bookmark.

    Returns a BeautifulSoup object holding a single
    <div class="user_annotations"> ... </div>. The div starts with a bold
    last-read summary and a <br>, followed by one HTML fragment for every
    entry of bookmark.user_notes, taken in sorted key order.
    '''
    from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString

    read_at = datetime.datetime.utcfromtimestamp(bookmark.timestamp)

    ka_soup = BeautifulSoup()
    wrapper = Tag(ka_soup, 'div')
    wrapper['class'] = 'user_annotations'
    is_pdf = bookmark.book_format == 'pdf'

    # Last-read summary: PDFs report a page, other formats a location
    summary = Tag(ka_soup, 'span')
    summary['style'] = 'font-weight:bold'
    if is_pdf:
        summary_text = _("%(time)s<br />Last Page Read: %(loc)d (%(pr)d%%)")
    else:
        summary_text = _("%(time)s<br />Last Page Read: Location %(loc)d (%(pr)d%%)")
    summary_text = summary_text % dict(
        time=strftime(u'%x', read_at.timetuple()),
        loc=bookmark.last_read_location,
        pr=bookmark.percent_read)
    summary.insert(0, NavigableString(summary_text))

    children = [summary, Tag(ka_soup, 'br')]

    if bookmark.user_notes:
        notes = bookmark.user_notes
        # Annotations are listed in sorted key order
        for key in sorted(notes):
            entry = notes[key]
            if entry['text']:
                # Notes are shown verbatim; any other type is italicized
                if entry['type'] == 'Note':
                    shown_text = entry['text']
                else:
                    shown_text = '<i>%s</i>' % entry['text']
                fragment = _('<b>Location %(dl)d • %(typ)s</b><br />%(text)s<br />') % dict(
                    dl=entry['displayed_location'],
                    typ=entry['type'],
                    text=shown_text)
            elif is_pdf:
                fragment = _('<b>Page %(dl)d • %(typ)s</b><br />') % dict(
                    dl=entry['displayed_location'],
                    typ=entry['type'])
            else:
                fragment = _('<b>Location %(dl)d • %(typ)s</b><br />') % dict(
                    dl=entry['displayed_location'],
                    typ=entry['type'])
            children.append(fragment)

    for position, child in enumerate(children):
        wrapper.insert(position, child)

    ka_soup.insert(0, wrapper)
    return ka_soup
def generate_annotation_html(self, bookmark):
    '''
    Build the annotation markup for a bookmark.

    Returns a BeautifulSoup object holding a single
    <div class="user_annotations"> ... </div>. The div starts with a bold
    last-read summary and a <br>, followed by one HTML fragment for every
    entry of bookmark.user_notes, taken in sorted key order.
    '''
    from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString

    read_at = datetime.datetime.utcfromtimestamp(bookmark.timestamp)

    ka_soup = BeautifulSoup()
    wrapper = Tag(ka_soup, 'div')
    wrapper['class'] = 'user_annotations'
    is_pdf = bookmark.book_format == 'pdf'

    # Last-read summary: PDFs report a page, other formats a location
    summary = Tag(ka_soup, 'span')
    summary['style'] = 'font-weight:bold'
    if is_pdf:
        summary_text = _("%(time)s<br />Last Page Read: %(loc)d (%(pr)d%%)")
    else:
        summary_text = _("%(time)s<br />Last Page Read: Location %(loc)d (%(pr)d%%)")
    summary_text = summary_text % dict(
        time=strftime(u'%x', read_at.timetuple()),
        loc=bookmark.last_read_location,
        pr=bookmark.percent_read)
    summary.insert(0, NavigableString(summary_text))

    children = [summary, Tag(ka_soup, 'br')]

    if bookmark.user_notes:
        notes = bookmark.user_notes
        # Annotations are listed in sorted key order
        for key in sorted(notes):
            entry = notes[key]
            if entry['text']:
                # Notes are shown verbatim; any other type is italicized
                if entry['type'] == 'Note':
                    shown_text = entry['text']
                else:
                    shown_text = '<i>%s</i>' % entry['text']
                fragment = _('<b>Location %(dl)d • %(typ)s</b><br />%(text)s<br />') % dict(
                    dl=entry['displayed_location'],
                    typ=entry['type'],
                    text=shown_text)
            elif is_pdf:
                fragment = _('<b>Page %(dl)d • %(typ)s</b><br />') % dict(
                    dl=entry['displayed_location'],
                    typ=entry['type'])
            else:
                fragment = _('<b>Location %(dl)d • %(typ)s</b><br />') % dict(
                    dl=entry['displayed_location'],
                    typ=entry['type'])
            children.append(fragment)

    for position, child in enumerate(children):
        wrapper.insert(position, child)

    ka_soup.insert(0, wrapper)
    return ka_soup
def generate_annotation_html(self, bookmark):
    """
    Build the annotation markup for a bookmark.

    Returns a BeautifulSoup object holding a single
    <div class="user_annotations"> ... </div>. The div starts with a bold
    last-read summary and a <br>, followed by one HTML fragment for every
    entry of bookmark.user_notes, taken in sorted key order.
    """
    from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString

    read_at = datetime.datetime.utcfromtimestamp(bookmark.timestamp)

    ka_soup = BeautifulSoup()
    wrapper = Tag(ka_soup, "div")
    wrapper["class"] = "user_annotations"
    is_pdf = bookmark.book_format == "pdf"

    # Last-read summary: PDFs report a page, other formats a location
    summary = Tag(ka_soup, "span")
    summary["style"] = "font-weight:bold"
    if is_pdf:
        summary_text = _("%(time)s<br />Last Page Read: %(loc)d (%(pr)d%%)")
    else:
        summary_text = _("%(time)s<br />Last Page Read: Location %(loc)d (%(pr)d%%)")
    summary_text = summary_text % dict(
        time=strftime(u"%x", read_at.timetuple()),
        loc=bookmark.last_read_location,
        pr=bookmark.percent_read,
    )
    summary.insert(0, NavigableString(summary_text))

    children = [summary, Tag(ka_soup, "br")]

    if bookmark.user_notes:
        notes = bookmark.user_notes
        # Annotations are listed in sorted key order
        for key in sorted(notes):
            entry = notes[key]
            if entry["text"]:
                # Notes are shown verbatim; any other type is italicized
                if entry["type"] == "Note":
                    shown_text = entry["text"]
                else:
                    shown_text = "<i>%s</i>" % entry["text"]
                fragment = _("<b>Location %(dl)d • %(typ)s</b><br />%(text)s<br />") % dict(
                    dl=entry["displayed_location"],
                    typ=entry["type"],
                    text=shown_text,
                )
            elif is_pdf:
                fragment = _("<b>Page %(dl)d • %(typ)s</b><br />") % dict(
                    dl=entry["displayed_location"], typ=entry["type"]
                )
            else:
                fragment = _("<b>Location %(dl)d • %(typ)s</b><br />") % dict(
                    dl=entry["displayed_location"], typ=entry["type"]
                )
            children.append(fragment)

    for position, child in enumerate(children):
        wrapper.insert(position, child)

    ka_soup.insert(0, wrapper)
    return ka_soup
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    Merge stored annotations (old_soup) with device annotations (new_soup).

    old_soup, new_soup: BeautifulSoup()

    Annotations are matched by the 'uts' attribute of their timestamp cell:
      - present only in old_soup: the stored (re-rendered) copy is kept
      - present only in new_soup: the device copy is added
      - present in both with equal hashes: the stored copy is kept, as the
        user may have modified it
      - present in both with different hashes: the device copy wins, since
        it has been updated after the initial capture

    The stored user_annotations div is removed from old_soup, captured into
    the transient database and re-rendered with the current CSS before
    merging.

    Returns the merged annotations, passed through sort_merged_annotations(),
    as a unicode string.
    '''
    TRANSIENT_DB = 'transient'

    # Index the stored annotations by timestamp
    timestamps = {}
    for sua in old_soup.findAll('div', 'annotation'):
        timestamp = sua.find('td', 'timestamp')['uts']
        timestamps[timestamp] = {'stored_hash': sua['hash']}

    # Re-render the stored annotations with the current CSS
    regurgitated_soup = None
    ouas = old_soup.find('div', 'user_annotations')
    if ouas:
        ouas.extract()
        parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)
        regurgitated_soup = BeautifulSoup(
            parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))

    # Without a user_annotations wrapper nothing was re-rendered; stored
    # annotations are then still inside old_soup, so look them up there
    # instead of failing on an unbound name.
    if regurgitated_soup is not None:
        stored_source = regurgitated_soup
    else:
        stored_source = old_soup

    # Add the device annotation hashes to the same index
    for dua in new_soup.findAll('div', 'annotation'):
        timestamp = dua.find('td', 'timestamp')['uts']
        timestamps.setdefault(timestamp, {})['device_hash'] = dua['hash']

    merged_soup = BeautifulSoup(ANNOTATIONS_HEADER)
    for ts in sorted(timestamps):
        stored_hash = timestamps[ts].get('stored_hash')
        device_hash = timestamps[ts].get('device_hash')
        if device_hash is None or device_hash == stored_hash:
            # Stored only, or stored matches device
            annotation = stored_source.find('div', {'hash': stored_hash})
        else:
            # Device only, or device updated since the initial capture
            annotation = new_soup.find('div', {'hash': device_hash})
        if annotation is not None:
            # find() returns None when the hash is missing from its source
            merged_soup.div.append(annotation)

    return unicode(sort_merged_annotations(merged_soup))
def _reformat(self, data, htmlpath):
    """
    Clean up one extracted HTML page and return the cleaned markup.

    data     -- raw page contents; decoded with self.input_encoding when set
    htmlpath -- path of the page on disk, used to resolve image references

    The page is parsed, <script> elements are dropped, a <body> nested inside
    <head> is moved under <html>, leading/trailing navigation tables and a
    leading <br> are removed, broken image paths are repaired and a body that
    consists of a single one-cell table is unwrapped.

    Returns the serialized soup and records htmlpath in
    self.re_encoded_files; returns data unchanged when parsing raises
    ValueError or serializing raises RuntimeError.
    """
    if self.input_encoding:
        data = data.decode(self.input_encoding)
    try:
        data = xml_to_unicode(data, strip_encoding_pats=True)[0]
        soup = BeautifulSoup(data)
    except ValueError:
        # hit some strange encoding problems...
        self.log.exception("Unable to parse html for cleaning, leaving it")
        return data

    # nuke javascript...
    for script in soup('script'):
        script.extract()

    # See if everything is inside a <head> tag
    # https://bugs.launchpad.net/bugs/1273512
    body = soup.find('body')
    if body is not None and body.parent.name == 'head':
        html = soup.find('html')
        html.insert(len(html), body)

    def strip_nav_table(table):
        # Remove the table only if it has an image whose alt text contains
        # prev, next or team. Best effort: a table without such an image
        # (no <img>, no alt attribute) is simply left alone.
        try:
            alt = table.img['alt'].lower()
            if 'prev' in alt or 'next' in alt or 'team' in alt:
                table.extract()
        except Exception:
            pass

    # remove forward and back nav bars from the top/bottom of each page,
    # they disturb the flow of the text and generally waste space
    t = soup('table')
    if t:
        first_table = t[0]
        if (first_table.previousSibling is None or
                first_table.previousSibling.previousSibling is None):
            strip_nav_table(first_table)
        last_table = t[-1]
        if (last_table.nextSibling is None or
                last_table.nextSibling.nextSibling is None):
            strip_nav_table(last_table)

    # for some very odd reason each page's content appears to be in a table
    # too. and this table has sub-tables for random asides... grr.

    # remove br at top of page if present after nav bars removed
    br = soup('br')
    if br:
        if check_all_prev_empty(br[0].previousSibling):
            br[0].extract()

    # some images seem to be broken in some chm's :/
    base = os.path.dirname(htmlpath)
    for img in soup('img', src=True):
        src = img['src']
        ipath = os.path.join(base, *src.split('/'))
        if os.path.exists(ipath):
            continue
        # Drop anything after the first ';' and try again
        src = src.split(';')[0]
        if not src:
            continue
        ipath = os.path.join(base, *src.split('/'))
        if not os.path.exists(ipath):
            while src.startswith('../'):
                src = src[3:]
        img['src'] = src

    try:
        # if there is only a single table with a single element
        # in the body, replace it by the contents of this single element
        tables = soup.body.findAll('table', recursive=False)
        if len(tables) == 1:
            trs = tables[0].findAll('tr', recursive=False)
            if len(trs) == 1:
                tds = trs[0].findAll('td', recursive=False)
                if len(tds) == 1:
                    tdContents = tds[0].contents
                    tableIdx = soup.body.contents.index(tables[0])
                    tables[0].extract()
                    # Popping from the end and inserting at a fixed index
                    # keeps the cell's children in their original order
                    while tdContents:
                        soup.body.insert(tableIdx, tdContents.pop())
    except Exception:
        # Best effort: leave the page as it is if unwrapping fails
        pass

    # do not prettify, it would reformat the <pre> tags!
    try:
        ans = str(soup)
        self.re_encoded_files.add(os.path.abspath(htmlpath))
        return ans
    except RuntimeError:
        return data
def comments_to_html(comments):
    r'''
    Convert random comment text to normalized, xml-legal block of <p>s

    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p> returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    Empty or None input returns '<p></p>'. Text matching sanitize_pat is
    passed to sanitize_comments_html(); if that fails the traceback is
    printed and '<p></p>' is returned.
    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        # Pure text: escape it and wrap each blank-line separated block
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>' % x.replace(u'\n', u'<br />')
                 for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except Exception:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(
        lambda m: m.group().replace('.', '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(
            lost_cr.group(),
            '%s%s\n\n%s' % (lost_cr.group(1), lost_cr.group(2),
                            lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash (written as an escape so the literal is
    # unicode regardless of the source file encoding)
    comments = comments.replace('--', u'\u2014')

    soup = BeautifulSoup(comments)
    result = BeautifulSoup()
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        # Exact type checks on purpose: the next branch tests for CData,
        # Comment, etc. separately, so isinstance() must not be used here
        if type(token) is NavigableString:
            if not open_pTag:
                pTag = Tag(result, 'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, prepare_string_for_xml(token))
            ptc += 1
        elif type(token) in (CData, Comment, Declaration,
                             ProcessingInstruction):
            continue
        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font',
                            'a', 'hr']:
            # Inline markup joins the currently open paragraph
            if not open_pTag:
                pTag = Tag(result, 'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            # Any other element closes the open paragraph and is emitted
            # on its own
            if open_pTag:
                result.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            result.insert(rtc, token)
            rtc += 1

    if open_pTag:
        result.insert(rtc, pTag)

    for p in result.findAll('p'):
        p['class'] = 'description'

    for t in result.findAll(text=True):
        t.replaceWith(prepare_string_for_xml(unicode(t)))

    return result.renderContents(encoding=None)
def to_HTML(self, header=''):
    '''
    Generate HTML with user-specified CSS, element order

    The element order and the CSS of the Text, Note and Timestamp elements
    come from the 'appearance_css' preference. Every annotation is rendered
    into a <div class="annotation"> inside the ANNOTATIONS_HEADER soup,
    optionally separated by the HORIZONTAL_RULE preference.

    header is accepted but not used by this method.

    Returns the rendered soup contents as a unicode string; with no
    annotations this is the rendered ANNOTATIONS_HEADER alone.
    '''
    # Retrieve CSS prefs
    from calibre_plugins.marvin_manager.appearance import default_elements
    stored_css = plugin_prefs.get('appearance_css', default_elements)

    # Default to empty styles so that prefs lacking an element do not leave
    # a style name unbound further down
    styles = {'Note': '', 'Text': '', 'Timestamp': ''}
    elements = []
    for element in stored_css:
        elements.append(element['name'])
        if element['name'] in styles:
            styles[element['name']] = re.sub('\n', '', element['css'])
    note_style = styles['Note']
    text_style = styles['Text']
    ts_style = styles['Timestamp']

    # Additional CSS for timestamp color and bg to be formatted
    datetime_style = ("background-color:{0};color:{1};" + ts_style)

    # Order the elements according to stored preferences
    comments_body = ''
    for element in elements:
        if element == 'Text':
            comments_body += '{text}'
        elif element == 'Note':
            comments_body += '{note}'
        elif element == 'Timestamp':
            ts_css = '''<table cellpadding="0" width="100%" style="{ts_style}" color="{color}">
                            <tr>
                                <td class="location" style="text-align:left">{location}</td>
                                <td class="timestamp" uts="{unix_timestamp}" style="text-align:right">{friendly_timestamp}</td>
                            </tr>
                        </table>'''
            # Whitespace between tags is layout only; strip it
            comments_body += re.sub(r'>\s+<', r'><', ts_css)

    soup = BeautifulSoup(ANNOTATIONS_HEADER)
    if self.annotations:
        dtc = 0

        # Add the annotations
        for i, agroup in enumerate(
                sorted(self.annotations, key=self._annotation_sorter)):
            location = agroup.location
            if location is None:
                location = ''

            friendly_timestamp = self._timestamp_to_datestr(agroup.timestamp)

            text = ''
            if agroup.text:
                text = ''.join(
                    '<p class="highlight" style="{0}">{1}</p>'.format(
                        text_style, agt)
                    for agt in agroup.text)

            note = ''
            if agroup.note:
                note = ''.join(
                    '<p class="note" style="{0}">{1}</p>'.format(
                        note_style, agn)
                    for agn in agroup.note)

            try:
                dt_bgcolor = COLOR_MAP[agroup.highlightcolor]['bg']
                dt_fgcolor = COLOR_MAP[agroup.highlightcolor]['fg']
            except (KeyError, TypeError):
                # Missing or unknown color: log it and use the default
                if agroup.highlightcolor is None:
                    msg = "No highlight color specified, using Default"
                else:
                    msg = "Unknown color '%s' specified" % agroup.highlightcolor
                self._log_location(msg)
                dt_bgcolor = COLOR_MAP['Default']['bg']
                dt_fgcolor = COLOR_MAP['Default']['fg']

            if agroup.hash is not None:
                # Use existing hash when re-rendering
                annotation_hash = agroup.hash
            else:
                m = hashlib.md5()
                m.update(text)
                m.update(note)
                annotation_hash = m.hexdigest()

            divTag = Tag(BeautifulSoup(), 'div')
            content_args = {
                'color': agroup.highlightcolor,
                'friendly_timestamp': friendly_timestamp,
                'location': location,
                'note': note,
                'text': text,
                'ts_style': datetime_style.format(dt_bgcolor, dt_fgcolor),
                'unix_timestamp': agroup.timestamp,
            }
            divTag.insert(0, comments_body.format(**content_args))
            divTag['class'] = "annotation"
            divTag['genre'] = ''
            if agroup.genre:
                divTag['genre'] = escape(agroup.genre)
            divTag['hash'] = annotation_hash
            divTag['location_sort'] = agroup.location_sort
            divTag['reader'] = agroup.reader_app
            divTag['style'] = ANNOTATION_DIV_STYLE
            soup.div.insert(dtc, divTag)
            dtc += 1

            # Separator between annotations, never after the last one
            if i < len(self.annotations) - 1 and \
                    plugin_prefs.get('appearance_hr_checkbox', False):
                soup.div.insert(
                    dtc,
                    plugin_prefs.get('HORIZONTAL_RULE', '<hr width="80%" />'))
                dtc += 1

    return unicode(soup.renderContents())
def generate_html(comments):
    '''
    Fill the jacket template and strip the header items that are empty.

    comments is inserted into the template as the 'comments' field. Custom
    column values are exposed as '_<name>' and '_<name>_label' fields; a
    column whose value cannot be escaped (for example None) is left out.

    Returns the generated markup as unicode, with encoding declarations
    removed.
    '''
    args = dict(xmlns=XHTML_NS,
                title_str=title_str,
                css=css,
                title=title,
                author=author,
                publisher=publisher,
                pubdate_label=_('Published'), pubdate=pubdate,
                series_label=_('Series'), series=series,
                rating_label=_('Rating'), rating=rating,
                tags_label=_('Tags'), tags=tags,
                comments=comments,
                footer='',
                searchable_tags=' '.join(escape(t)+'ttt' for t in tags.tags_list),
                )
    for key in mi.custom_field_keys():
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            key = key.replace('#', '_')
            args[key] = escape(val)
            args[key+'_label'] = escape(display_name)
        except Exception:
            # if the val (custom column contents) is None, don't add to args
            pass

    # Used in the comment describing use of custom columns in templates
    # Don't change this unless you also change it in template.xhtml
    args.setdefault('_genre_label', '{_genre_label}')
    args.setdefault('_genre', '{_genre}')

    formatter = SafeFormatter()
    generated_html = formatter.format(template, **args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    optional_items = (
        (series, 'cbj_series'),
        (rating, 'cbj_rating'),
        (tags, 'cbj_tags'),
        (pubdate, 'cbj_pubdata'),
    )
    for value, css_class in optional_items:
        if not value:
            empty_tag = soup.find(attrs={'class': css_class})
            if empty_tag is not None:
                empty_tag.extract()

    # The banner rule is only kept for the kindle output profile
    if output_profile.short_name != 'kindle':
        hr_tag = soup.find('hr', attrs={'class': 'cbj_kindle_banner_hr'})
        if hr_tag is not None:
            hr_tag.extract()

    return strip_encoding_declarations(
        soup.renderContents('utf-8').decode('utf-8'))
def move_annotations(parent, annotation_map, old_destination_field, new_destination_field,
                     window_title="Moving annotations"):
    '''
    Move annotations from old_destination_field to new_destination_field
    annotation_map precalculated in thread in config.py

    parent                -- object providing opts.gui.current_db, opts.db,
                             custom_fields and gui
    annotation_map        -- iterable of book ids to process
    old_destination_field -- 'Comments' or a custom field name starting
                             with '#'
    new_destination_field -- 'Comments' or a custom field name starting
                             with '#'
    window_title          -- title of the progress dialog

    For every book the user_annotations div is removed from the old field,
    captured into the transient database, re-rendered with the current CSS
    and written to the new field. When both fields are the same the
    annotations are re-rendered in place. A message box reports the result
    and the calibre GUI view is refreshed at the end.
    '''
    import calibre_plugins.annotations.config as cfg

    _log_location("%s -> %s" % (old_destination_field, new_destination_field))

    db = parent.opts.gui.current_db
    # NOTE(review): id is not referenced again in this function
    id = db.FIELD_MAP['id']

    # Show progress
    pb = ProgressBar(parent=parent, window_title=window_title, on_top=True)
    total_books = len(annotation_map)
    pb.set_maximum(total_books)
    pb.set_value(1)
    pb.set_label('{:^100}'.format('Moving annotations for %d books' % total_books))
    pb.show()

    # Name of the database used to hold captured annotations while moving
    transient_db = 'transient'

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        cfg.plugin_prefs.get('COMMENTS_DIVIDER', '· · • · ✦ · • · ·'))

    for cid in annotation_map:
        mi = db.get_metadata(cid, index_is_id=True)

        # Comments -> custom
        if old_destination_field == 'Comments' and new_destination_field.startswith('#'):
            if mi.comments:
                old_soup = BeautifulSoup(mi.comments)
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from Comments
                    uas.extract()

                    # Remove comments_divider from Comments
                    cd = old_soup.find('div', 'comments_divider')
                    if cd:
                        cd.extract()

                    # Save stripped Comments
                    mi.comments = unicode(old_soup)

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Add user_annotations to destination
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record with stripped Comments, populated custom field
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # custom -> Comments
        elif old_destination_field.startswith('#') and new_destination_field == 'Comments':
            if mi.get_user_metadata(old_destination_field, False)['#value#'] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um['#value#'] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add user_annotations to Comments; existing Comments
                    # are kept and separated by the divider
                    if mi.comments is None:
                        mi.comments = unicode(new_soup)
                    else:
                        mi.comments = mi.comments + \
                            unicode(comments_divider) + \
                            unicode(new_soup)

                    # Update the record with stripped custom field, updated Comments
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # custom -> custom
        elif old_destination_field.startswith('#') and new_destination_field.startswith('#'):
            if mi.get_user_metadata(old_destination_field, False)['#value#'] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um['#value#'] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # same field -> same field - called from config:configure_appearance()
        elif (old_destination_field == new_destination_field):
            pb.set_label('{:^100}'.format('Updating annotations for %d books' % total_books))

            if new_destination_field == 'Comments':
                if mi.comments:
                    old_soup = BeautifulSoup(mi.comments)
                    uas = old_soup.find('div', 'user_annotations')
                    if uas:
                        # Remove user_annotations from Comments
                        uas.extract()

                        # Remove comments_divider from Comments
                        cd = old_soup.find('div', 'comments_divider')
                        if cd:
                            cd.extract()

                        # Save stripped Comments
                        mi.comments = unicode(old_soup)

                        # Capture content
                        parent.opts.db.capture_content(uas, cid, transient_db)

                        # Regurgitate content with current CSS style
                        new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                        # Add user_annotations to Comments
                        if mi.comments is None:
                            mi.comments = unicode(new_soup)
                        else:
                            mi.comments = mi.comments + \
                                unicode(comments_divider) + \
                                unicode(new_soup)

                        # Update the record with stripped custom field, updated Comments
                        db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                        commit=True, force_changes=True, notify=True)
                        pb.increment()

            else:
                # Update custom field
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Add stripped old_soup plus new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(old_soup) + unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

    # Hide the progress bar
    pb.hide()

    # Change field value to friendly name
    if old_destination_field.startswith('#'):
        for cf in parent.custom_fields:
            if parent.custom_fields[cf]['field'] == old_destination_field:
                old_destination_field = cf
                break
    if new_destination_field.startswith('#'):
        for cf in parent.custom_fields:
            if parent.custom_fields[cf]['field'] == new_destination_field:
                new_destination_field = cf
                break

    # Report what happened; the {0} placeholder is filled with
    # 'book' or 'books' below
    if old_destination_field == new_destination_field:
        msg = "<p>Annotations updated to new appearance settings for %d {0}.</p>" % len(annotation_map)
    else:
        msg = ("<p>Annotations for %d {0} moved from <b>%s</b> to <b>%s</b>.</p>" %
               (len(annotation_map), old_destination_field, new_destination_field))
    if len(annotation_map) == 1:
        msg = msg.format('book')
    else:
        msg = msg.format('books')
    MessageBox(MessageBox.INFO,
               '',
               msg=msg,
               show_copy_button=False,
               parent=parent.gui).exec_()
    _log_location()
    _log("INFO: %s" % msg)

    # Update the UI
    updateCalibreGUIView()
def move_annotations(
    parent, annotation_map, old_destination_field, new_destination_field, window_title="Moving annotations"
):
    """
    Move annotations from old_destination_field to new_destination_field.

    annotation_map is precalculated in a thread in config.py; iterating it
    yields the book ids that are passed to db.get_metadata().

    Supported transitions:
      * same field -> same field: re-render the stored annotations in place
        with the current CSS (called from config:configure_appearance())
      * Comments -> custom field
      * custom field -> Comments
      * custom field -> custom field
    Any other combination leaves the books untouched.

    A book whose source field holds no <div class="user_annotations"> is
    skipped: its record is not rewritten and the progress bar is not
    advanced for it.

    When the loop finishes a summary MessageBox is shown, the message is
    logged and the calibre GUI view is refreshed. Returns None.
    """
    import calibre_plugins.marvin_manager.config as cfg

    _log_location(annotation_map)
    _log(" %s -> %s" % (old_destination_field, new_destination_field))

    db = parent.opts.gui.current_db
    transient_db = "transient"
    total_books = len(annotation_map)

    # Classify the request once. The same-field case is decided first:
    # a custom field moved onto itself also satisfies "custom -> custom",
    # and handling it that way would overwrite the field with the
    # annotations only, losing its other content.
    same_field = old_destination_field == new_destination_field
    from_comments = old_destination_field == "Comments"
    to_comments = new_destination_field == "Comments"
    from_custom = old_destination_field.startswith("#")
    to_custom = new_destination_field.startswith("#")
    supported = same_field or (from_comments and to_custom) or (from_custom and (to_comments or to_custom))

    # Show progress
    pb = ProgressBar(parent=parent, window_title=window_title)
    pb.set_maximum(total_books)
    pb.set_value(1)
    verb = "Updating" if same_field else "Moving"
    pb.set_label("{:^100}".format("%s annotations for %d books" % (verb, total_books)))
    pb.show()

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        cfg.plugin_prefs.get("COMMENTS_DIVIDER", "· · • · ✦ · • · ·")
    )

    def custom_value(mi, field):
        # Raw stored value of a custom field
        return mi.get_user_metadata(field, False)["#value#"]

    def set_custom_value(mi, field, value):
        # Store value in a custom field of mi
        um = mi.metadata_for_field(field)
        um["#value#"] = value
        mi.set_user_metadata(field, um)

    def pull_annotations(markup, cid, strip_divider=False):
        # Detach the user_annotations div from markup and re-render it with
        # the current CSS. Returns (stripped_markup, rendered_annotations),
        # or None when markup is empty or holds no annotations.
        if not markup:
            return None
        old_soup = BeautifulSoup(markup)
        uas = old_soup.find("div", "user_annotations")
        if uas is None:
            return None
        uas.extract()
        if strip_divider:
            cd = old_soup.find("div", "comments_divider")
            if cd is not None:
                cd.extract()
        # Capture content, then regurgitate it with current CSS style
        parent.opts.db.capture_content(uas, cid, transient_db)
        new_soup = parent.opts.db.rerender_to_html(transient_db, cid)
        return unicode(old_soup), unicode(new_soup)

    def append_to_comments(mi, rendered):
        # Only emit the divider when there are comments to separate from.
        # The previous `is None` test could not succeed once the stripped
        # markup had been stored, so empty Comments got a leading divider.
        if mi.comments:
            mi.comments = mi.comments + unicode(comments_divider) + rendered
        else:
            mi.comments = rendered

    for cid in annotation_map:
        mi = db.get_metadata(cid, index_is_id=True)
        if not supported:
            continue

        if from_comments:
            pulled = pull_annotations(mi.comments, cid, strip_divider=True)
        else:
            pulled = pull_annotations(custom_value(mi, old_destination_field), cid)
        if pulled is None:
            continue
        stripped, rendered = pulled

        if same_field and to_comments:
            # Comments re-rendered in place
            mi.comments = stripped
            append_to_comments(mi, rendered)
        elif same_field:
            # Custom field re-rendered in place: stripped content plus annotations
            set_custom_value(mi, new_destination_field, stripped + rendered)
        elif from_comments:
            # Comments -> custom
            mi.comments = stripped
            set_custom_value(mi, new_destination_field, rendered)
        else:
            # custom -> Comments, or custom -> custom
            set_custom_value(mi, old_destination_field, stripped)
            if to_comments:
                append_to_comments(mi, rendered)
            else:
                set_custom_value(mi, new_destination_field, rendered)

        # Update the record
        db.set_metadata(
            cid, mi, set_title=False, set_authors=False, commit=True, force_changes=True, notify=True
        )
        pb.increment()

    # Hide the progress bar
    pb.hide()

    # Friendly names of the eligible ("comments" datatype) custom fields
    friendly_names = {}
    for cf in db.custom_field_keys():
        field_md = db.metadata_for_field(cf)
        if field_md["datatype"] == "comments":
            friendly_names[cf] = field_md["name"]

    def friendly(field):
        # Change field value to friendly name
        if field.startswith("#"):
            return friendly_names.get(field, field)
        return field

    # Report what happened. The noun is interpolated together with the
    # other values so that braces in a field name cannot break formatting.
    noun = "book" if total_books == 1 else "books"
    if same_field:
        msg = "<p>Annotations updated to new appearance settings for %d %s.</p>" % (total_books, noun)
    else:
        msg = "<p>Annotations for %d %s moved from <b>%s</b> to <b>%s</b>.</p>" % (
            total_books,
            noun,
            friendly(old_destination_field),
            friendly(new_destination_field),
        )
    MessageBox(MessageBox.INFO, "", msg=msg, show_copy_button=False, parent=parent.gui).exec_()
    _log("INFO: %s" % msg)

    # Update the UI
    updateCalibreGUIView()
def move_annotations(parent, annotation_map, old_destination_field,
                     new_destination_field, window_title="Moving annotations"):
    '''
    Move annotations from old_destination_field to new_destination_field.

    annotation_map is precalculated in a thread in config.py; iterating it
    yields the book ids that are passed to db.get_metadata().

    Supported transitions:
      * same field -> same field: re-render the stored annotations in place
        with the current CSS (called from config:configure_appearance())
      * Comments -> custom field
      * custom field -> Comments
      * custom field -> custom field
    Any other combination leaves the books untouched.

    A book whose source field holds no <div class="user_annotations"> is
    skipped: its record is not rewritten and the progress bar is not
    advanced for it.

    When the loop finishes a summary MessageBox is shown, the message is
    logged and the calibre GUI view is refreshed. Returns None.
    '''
    import calibre_plugins.marvin_manager.config as cfg

    _log_location(annotation_map)
    _log(" %s -> %s" % (old_destination_field, new_destination_field))

    db = parent.opts.gui.current_db
    transient_db = 'transient'
    total_books = len(annotation_map)

    # Decide the kind of transfer once. Same-field is tested before
    # "custom -> custom" because a custom field moved onto itself matches
    # both, and the move logic would replace the field with the annotations
    # alone, discarding whatever else it contained.
    same_field = old_destination_field == new_destination_field
    from_comments = old_destination_field == 'Comments'
    to_comments = new_destination_field == 'Comments'
    from_custom = old_destination_field.startswith('#')
    to_custom = new_destination_field.startswith('#')
    supported = (same_field or
                 (from_comments and to_custom) or
                 (from_custom and (to_comments or to_custom)))

    # Show progress
    pb = ProgressBar(parent=parent, window_title=window_title)
    pb.set_maximum(total_books)
    pb.set_value(1)
    verb = 'Updating' if same_field else 'Moving'
    pb.set_label('{:^100}'.format('%s annotations for %d books' % (verb, total_books)))
    pb.show()

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        cfg.plugin_prefs.get('COMMENTS_DIVIDER', '· · • · ✦ · • · ·'))

    def read_custom(mi, field):
        # Raw stored value of a custom field
        return mi.get_user_metadata(field, False)['#value#']

    def write_custom(mi, field, value):
        # Store value in a custom field of mi
        um = mi.metadata_for_field(field)
        um['#value#'] = value
        mi.set_user_metadata(field, um)

    def extract_and_rerender(markup, cid, strip_divider=False):
        # Detach the user_annotations div from markup and re-render it with
        # the current CSS. Returns (stripped_markup, rendered_annotations),
        # or None when markup is empty or holds no annotations.
        if not markup:
            return None
        old_soup = BeautifulSoup(markup)
        uas = old_soup.find('div', 'user_annotations')
        if uas is None:
            return None
        uas.extract()
        if strip_divider:
            cd = old_soup.find('div', 'comments_divider')
            if cd is not None:
                cd.extract()
        # Capture content, then regurgitate it with current CSS style
        parent.opts.db.capture_content(uas, cid, transient_db)
        new_soup = parent.opts.db.rerender_to_html(transient_db, cid)
        return unicode(old_soup), unicode(new_soup)

    def add_to_comments(mi, rendered):
        # The divider is only useful when comments precede the annotations.
        # The former `is None` test was dead once the stripped markup had
        # been assigned, so empty Comments ended up with a leading divider.
        if mi.comments:
            mi.comments = mi.comments + unicode(comments_divider) + rendered
        else:
            mi.comments = rendered

    for cid in annotation_map:
        mi = db.get_metadata(cid, index_is_id=True)
        if not supported:
            continue

        if from_comments:
            result = extract_and_rerender(mi.comments, cid, strip_divider=True)
        else:
            result = extract_and_rerender(read_custom(mi, old_destination_field), cid)
        if result is None:
            continue
        stripped, rendered = result

        if same_field and to_comments:
            # Comments re-rendered in place
            mi.comments = stripped
            add_to_comments(mi, rendered)
        elif same_field:
            # Custom field re-rendered in place: stripped content plus annotations
            write_custom(mi, new_destination_field, stripped + rendered)
        elif from_comments:
            # Comments -> custom
            mi.comments = stripped
            write_custom(mi, new_destination_field, rendered)
        else:
            # custom -> Comments, or custom -> custom
            write_custom(mi, old_destination_field, stripped)
            if to_comments:
                add_to_comments(mi, rendered)
            else:
                write_custom(mi, new_destination_field, rendered)

        # Update the record
        db.set_metadata(cid, mi, set_title=False, set_authors=False,
                        commit=True, force_changes=True, notify=True)
        pb.increment()

    # Hide the progress bar
    pb.hide()

    # Friendly names of the eligible ('comments' datatype) custom fields
    friendly_names = {}
    for cf in db.custom_field_keys():
        field_md = db.metadata_for_field(cf)
        if field_md['datatype'] == 'comments':
            friendly_names[cf] = field_md['name']

    def friendly(field):
        # Change field value to friendly name
        if field.startswith('#'):
            return friendly_names.get(field, field)
        return field

    # Report what happened. The noun is interpolated with the other values
    # so that braces inside a field name cannot break the formatting.
    noun = 'book' if total_books == 1 else 'books'
    if same_field:
        msg = ("<p>Annotations updated to new appearance settings for %d %s.</p>" %
               (total_books, noun))
    else:
        msg = ("<p>Annotations for %d %s moved from <b>%s</b> to <b>%s</b>.</p>" %
               (total_books, noun,
                friendly(old_destination_field), friendly(new_destination_field)))
    MessageBox(MessageBox.INFO, '', msg=msg, show_copy_button=False,
               parent=parent.gui).exec_()
    _log("INFO: %s" % msg)

    # Update the UI
    updateCalibreGUIView()
def generate_html(comments):
    """
    Fill the jacket template and return the resulting markup as text.

    comments is passed to the template as the ``comments`` argument. All
    other template values (title, author, series, mi, template, ...) are
    names that are not defined in this block and are resolved from the
    surrounding scope.

    Every custom column is exposed to the template as ``_name`` (the
    leading '#' replaced by '_') together with ``_name_label``. After
    formatting, header items whose value is empty are removed from the
    markup, and the Kindle banner <hr> is removed unless the output
    profile's short_name is "kindle".
    """
    args = dict(
        xmlns=XHTML_NS,
        title_str=title_str,
        css=css,
        title=title,
        author=author,
        publisher=publisher,
        pubdate_label=_("Published"),
        pubdate=pubdate,
        series_label=_("Series"),
        series=series,
        rating_label=_("Rating"),
        rating=rating,
        tags_label=_("Tags"),
        tags=tags,
        comments=comments,
        footer="",
    )
    for key in mi.custom_field_keys():
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            arg_name = key.replace("#", "_")
            args[arg_name] = escape(val)
            args[arg_name + "_label"] = escape(display_name)
        except Exception:
            # if the val (custom column contents) is None, don't add to args.
            # Deliberately best-effort, but no longer a bare except, so
            # KeyboardInterrupt/SystemExit are not swallowed.
            pass

    # Debugging aid, disabled by default
    if False:
        print("Custom column values available in jacket template:")
        for key in args.keys():
            if key.startswith("_") and not key.endswith("_label"):
                print(" %s: %s" % ("#" + key[1:], args[key]))

    # Used in the comment describing use of custom columns in templates
    # Don't change this unless you also change it in template.xhtml
    args["_genre_label"] = args.get("_genre_label", "{_genre_label}")
    args["_genre"] = args.get("_genre", "{_genre}")

    formatter = SafeFormatter()
    generated_html = formatter.format(template, **args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    optional_items = (
        (series, "cbj_series"),
        (rating, "cbj_rating"),
        (tags, "cbj_tags"),
        (pubdate, "cbj_pubdata"),
    )
    for value, css_class in optional_items:
        if value:
            continue
        empty_tag = soup.find(attrs={"class": css_class})
        if empty_tag is not None:
            empty_tag.extract()

    if output_profile.short_name != "kindle":
        hr_tag = soup.find("hr", attrs={"class": "cbj_kindle_banner_hr"})
        if hr_tag is not None:
            hr_tag.extract()

    return strip_encoding_declarations(soup.renderContents("utf-8").decode("utf-8"))
def _reformat(self, data, htmlpath):
    """
    Clean up one HTML page and return the cleaned markup.

    data is the page content; when self.input_encoding is set it is decoded
    with that encoding first. htmlpath is the path of the page on disk and
    is used to resolve relative image references.

    Returns the serialized soup, or the (decoded) input data unchanged when
    the page cannot be parsed (ValueError) or serialized (RuntimeError).
    On success the absolute htmlpath is added to self.re_encoded_files.
    """
    if self.input_encoding:
        data = data.decode(self.input_encoding)
    try:
        data = xml_to_unicode(data, strip_encoding_pats=True)[0]
        soup = BeautifulSoup(data)
    except ValueError:
        # hit some strange encoding problems...
        self.log.exception("Unable to parse html for cleaning, leaving it")
        return data

    # nuke javascript...
    for script in soup('script'):
        script.extract()

    # See if everything is inside a <head> tag
    # https://bugs.launchpad.net/bugs/1273512
    body = soup.find('body')
    if body is not None and body.parent.name == 'head':
        html = soup.find('html')
        html.insert(len(html), body)

    def is_nav_table(table):
        # A navigation bar is a table whose image has an alt attribute
        # containing prev, next or team. Any lookup failure (no image,
        # no alt attribute, ...) means "not a navigation bar".
        try:
            alt = table.img['alt'].lower()
        except Exception:
            return False
        return 'prev' in alt or 'next' in alt or 'team' in alt

    # remove forward and back nav bars from the top/bottom of each page
    # cos they really f**k with the flow of things and generally waste space
    # only the first and the last table are candidates, and only when they
    # sit at the very start/end of their parent
    tables = soup('table')
    if tables:
        first, last = tables[0], tables[-1]
        if (first.previousSibling is None or
                first.previousSibling.previousSibling is None):
            if is_nav_table(first):
                first.extract()
        if (last.nextSibling is None or
                last.nextSibling.nextSibling is None):
            if is_nav_table(last):
                last.extract()

    # for some very odd reason each page's content appears to be in a table
    # too. and this table has sub-tables for random asides... grr.

    # remove br at top of page if present after nav bars removed
    br = soup('br')
    if br:
        if check_all_prev_empty(br[0].previousSibling):
            br[0].extract()

    # some images seem to be broken in some chm's :/
    base = os.path.dirname(htmlpath)
    for img in soup('img', src=True):
        src = img['src']
        ipath = os.path.join(base, *src.split('/'))
        if os.path.exists(ipath):
            continue
        # Drop anything after the first ';' and retry
        src = src.split(';')[0]
        if not src:
            continue
        ipath = os.path.join(base, *src.split('/'))
        if not os.path.exists(ipath):
            while src.startswith('../'):
                src = src[3:]
        img['src'] = src

    try:
        # if there is only a single table with a single element
        # in the body, replace it by the contents of this single element
        tables = soup.body.findAll('table', recursive=False)
        if tables and len(tables) == 1:
            trs = tables[0].findAll('tr', recursive=False)
            if trs and len(trs) == 1:
                tds = trs[0].findAll('td', recursive=False)
                if tds and len(tds) == 1:
                    tdContents = tds[0].contents
                    tableIdx = soup.body.contents.index(tables[0])
                    tables[0].extract()
                    # Popping from the end and inserting at a fixed index
                    # keeps the original order of the cell's children
                    while tdContents:
                        soup.body.insert(tableIdx, tdContents.pop())
    except Exception:
        # Best effort only: leave the page as it is if unwrapping fails
        pass

    # do not prettify, it would reformat the <pre> tags!
    try:
        ans = str(soup)
        self.re_encoded_files.add(os.path.abspath(htmlpath))
        return ans
    except RuntimeError:
        return data
def start_fetch(self, url):
    """
    Start fetching at url.

    The url is wrapped in a one-anchor document which is handed to
    self.process_links(); the value returned by process_links is logged
    at debug level and returned.
    """
    seed_markup = u''.join((u'<a href="', url, u'" />'))
    seed_soup = BeautifulSoup(seed_markup)
    saved_to = self.process_links(seed_soup, url, 0, into_dir='')
    self.log.debug(url, 'saved to', saved_to)
    return saved_to
def get_soup(self, src, url=None):
    """
    Parse src into a soup and apply the configured clean-up steps.

    Steps, in order: convert to unicode, self.preprocess_raw_html(),
    the self.preprocess_regexps substitutions plus HTML comment removal,
    parsing, the optional replacement returned by
    self.prepreprocess_html_ext(), keep_only_tags, remove_tags_after,
    remove_tags_before and remove_tags. The result of
    self.preprocess_html_ext(soup) is returned.

    Note that a dict-valued self.keep_only_tags is replaced on the
    instance by a one-element list.
    """
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    strip_comments = (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')
    substitutions = list(self.preprocess_regexps)
    substitutions.append(strip_comments)

    def substitute_all(text):
        # Run every (pattern, replacement) pair over text, in order
        for pattern, replacement in substitutions:
            text = pattern.sub(replacement, text)
        return text

    markup = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    markup = self.preprocess_raw_html(markup, url)
    soup = BeautifulSoup(substitute_all(markup))

    replacement_src = self.prepreprocess_html_ext(soup)
    if replacement_src is not None:
        replacement_src = xml_to_unicode(
            replacement_src, self.verbose, strip_encoding_pats=True)[0]
        soup = BeautifulSoup(substitute_all(replacement_src))

    if self.keep_only_tags:
        new_body = soup.new_tag('body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for kept in soup.find('body').findAll(**spec):
                    new_body.insert(len(new_body.contents), kept)
            soup.find('body').replaceWith(new_body)
        except AttributeError:
            # soup has no body element
            pass

    def remove_beyond(tag, direction):
        # Walk from tag up to (but not including) <body>, extracting every
        # sibling found in the given direction at each level
        while tag is not None and getattr(tag, 'name', None) != 'body':
            doomed = getattr(tag, direction)
            while doomed is not None:
                upcoming = getattr(tag, direction)
                doomed.extract()
                doomed = upcoming
            tag = tag.parent

    trims = (
        ('remove_tags_after', 'nextSibling'),
        ('remove_tags_before', 'previousSibling'),
    )
    for option_name, direction in trims:
        option = getattr(self, option_name)
        if option is None:
            continue
        specs = [option] if isinstance(option, dict) else option
        for spec in specs:
            anchor = soup.find(**spec)
            remove_beyond(anchor, direction)

    for kwds in self.remove_tags:
        for unwanted in soup.findAll(**kwds):
            unwanted.extract()

    return self.preprocess_html_ext(soup)
def update_results(self, trigger):
    """
    Re-evaluate the find-annotations filters and refresh the results label.

    The reader, color, date range, highlight text and note text chosen in
    the dialog widgets are applied to the annotations stored for every book
    in self.annotated_books_scanner.annotation_map. A book matches when at
    least one of its annotations passes every active filter.

    Side effects: self.matched_ids is rebuilt, self.result_label is
    updated and self.resize_dialog() is called. trigger is not used by
    this method. Returns None.

    The text and note filters are regular expressions (case-insensitive).
    Input that is not a valid pattern, e.g. an unbalanced "(" typed so
    far, is matched literally instead of raising.
    """
    # self._log_location(trigger)
    reader_to_match = str(self.find_annotations_reader_comboBox.currentText())
    color_to_match = str(self.find_annotations_color_comboBox.currentText())
    text_to_match = str(self.find_annotations_text_lineEdit.text())
    note_to_match = str(self.find_annotations_note_lineEdit.text())

    from_date = self.find_annotations_date_from_dateEdit.dateTime().toTime_t()
    to_date = self.find_annotations_date_to_dateEdit.dateTime().toTime_t()

    annotation_map = self.annotated_books_scanner.annotation_map
    # field = self.prefs.get("cfg_annotations_destination_field", None)
    field = get_cc_mapping('annotations', 'field', None)

    db = self.opts.gui.current_db

    def compile_filter(pattern):
        # None means "filter not active". Compiled once, not per annotation.
        if not pattern:
            return None
        try:
            return re.compile(pattern, flags=re.IGNORECASE)
        except re.error:
            return re.compile(re.escape(pattern), flags=re.IGNORECASE)

    text_filter = compile_filter(text_to_match)
    note_filter = compile_filter(note_to_match)

    def gather_text(ua, css_class):
        # Newline-joined strings of the <p class=css_class> elements of ua.
        # Collection stops at the first element without a plain string;
        # what was gathered up to that point is kept.
        gathered = ''
        try:
            for el in ua.findAll('p', css_class):
                gathered += el.string + '\n'
        except Exception:
            pass
        return gathered

    def annotation_matches(ua):
        # Check reader
        if reader_to_match != self.GENERIC_READER:
            if ua['reader'] != reader_to_match:
                return False

        # Check color
        if color_to_match != self.GENERIC_STYLE:
            if ua.find('table')['color'] != color_to_match:
                return False

        # Check date range, allow for mangled timestamp
        try:
            timestamp = float(ua.find('td', 'timestamp')['uts'])
        except Exception:
            return False
        if timestamp < from_date or timestamp > to_date:
            return False

        if text_filter is not None:
            if not text_filter.search(gather_text(ua, 'highlight')):
                return False

        if note_filter is not None:
            if not note_filter.search(gather_text(ua, 'note')):
                return False

        return True

    matched_titles = []
    self.matched_ids = set()

    for cid in annotation_map:
        mi = db.get_metadata(cid, index_is_id=True)
        if field == 'Comments':
            markup = mi.comments
        else:
            markup = mi.get_user_metadata(field, False)['#value#']
        if not markup:
            continue

        soup = BeautifulSoup(markup)
        # any() stops at the first matching annotation of the book
        if any(annotation_matches(ua) for ua in soup.findAll('div', 'annotation')):
            self.matched_ids.add(cid)
            matched_titles.append(mi.title)

    # Update the results box
    matched_titles.sort()
    if len(annotation_map):
        if len(matched_titles):
            first_match = ("<i>%s</i>" % matched_titles[0])
            if len(matched_titles) == 1:
                results = first_match
            else:
                results = first_match + (" and %d more." % (len(matched_titles) - 1))
            self.result_label.setText('<p style="color:blue">{0}</p>'.format(results))
        else:
            self.result_label.setText('<p style="color:red">no matches</p>')
    else:
        self.result_label.setText('<p style="color:red">no annotated books in library</p>')

    self.resize_dialog()
def preprocess_html(self, soup):
    """
    Clean up one downloaded article page.

    Returns None to skip the article when the web edition is being fetched,
    self.oldest_article is positive and the article's dateline is older
    than self.earliest_date. Otherwise returns self.strip_anchors(soup)
    after:
      * removing the dateline (non-web edition only),
      * optionally replacing images by the high resolution versions linked
        from the "Enlarge this Image" pop-ups (self.useHighResImages),
      * removing the headers of the "Related content" bars,
      * removing any remaining "Enlarge this Image" blocks.

    Each clean-up step is best effort: a failure is logged through
    self.log and the remaining steps still run.
    """
    if self.webEdition and self.oldest_article > 0:
        date_tag = soup.find(True, attrs={'class': ['dateline', 'date']})
        if date_tag:
            date_str = self.tag_to_string(date_tag, use_alt=False)
            date_str = date_str.replace('Published:', '')
            date_items = date_str.split(',')
            try:
                datestring = date_items[0] + ' ' + date_items[1]
                article_date = self.decode_us_date(datestring)
            except Exception:
                # Unparseable dateline: treat the article as published today
                article_date = date.today()
            if article_date < self.earliest_date:
                self.log("Skipping article dated %s" % date_str)
                return None

    # all articles are from today, no need to print the date on every page
    try:
        if not self.webEdition:
            date_tag = soup.find(True, attrs={'class': ['dateline', 'date']})
            if date_tag:
                date_tag.extract()
    except Exception:
        self.log("Error removing the published date")

    if self.useHighResImages:
        try:
            # open up all the "Enlarge this Image" pop-ups and download the
            # full resolution jpegs
            for popupref in soup.findAll('div', {'class': 'icon enlargeThis'}):
                popupreflink = popupref.find('a')
                if not popupreflink:
                    continue

                # The pop-up page address is embedded in a javascript: href
                reflinkstring = str(popupreflink['href'])
                marker = "javascript:pop_me_up2('"
                refstart = reflinkstring.find(marker) + len(marker)
                refend = reflinkstring.find(".html", refstart) + len(".html")
                reflinkstring = reflinkstring[refstart:refend]

                popuppage = self.browser.open(reflinkstring)
                try:
                    popuphtml = popuppage.read()
                finally:
                    # Close the response even when reading fails
                    popuppage.close()
                if not popuphtml:
                    continue

                # The high resolution image is looked up under today's
                # image folder
                st = time.localtime()
                year = str(st.tm_year)
                month = "%.2d" % st.tm_mon
                day = "%.2d" % st.tm_mday
                img_prefix = 'http://graphics8.nytimes.com/images/' + year + '/' + month + '/' + day + '/'
                imgstartpos = popuphtml.find(img_prefix) + len(img_prefix)
                highResImageLink = img_prefix + popuphtml[imgstartpos:popuphtml.find('.jpg', imgstartpos) + 4]

                popupSoup = BeautifulSoup(popuphtml)
                highResTag = popupSoup.find('img', {'src': highResImageLink})
                if not highResTag:
                    continue

                # Reset per pop-up so that a failed lookup can never reuse
                # the image tag found for a previous pop-up
                imageTag = None
                try:
                    newWidth = highResTag['width']
                    newHeight = highResTag['height']
                    imageTag = popupref.parent.find("img")
                except Exception:
                    self.log("Error: finding width and height of img")
                popupref.extract()
                if imageTag:
                    try:
                        imageTag['src'] = highResImageLink
                        imageTag['width'] = newWidth
                        imageTag['height'] = newHeight
                    except Exception:
                        self.log("Error setting the src width and height parameters")
        except Exception:
            self.log("Error pulling high resolution images")

    try:
        # remove "Related content" bar
        runAroundsFound = soup.findAll('div', {'class': [
            'articleInline runaroundLeft',
            'articleInline doubleRule runaroundLeft',
            'articleInline runaroundLeft firstArticleInline',
            'articleInline runaroundLeft ',
            'articleInline runaroundLeft lastArticleInline']})
        for runAround in runAroundsFound:
            # find all section headers
            for hline in runAround.findAll(True, {'class': ['sectionHeader', 'sectionHeader flushBottom']}):
                hline.extract()
            # find all h6 headers
            for hline in runAround.findAll('h6'):
                hline.extract()
    except Exception:
        self.log("Error removing related content bar")

    try:
        # in case pulling images failed, delete the enlarge this text
        for popupref in soup.findAll('div', {'class': 'icon enlargeThis'}):
            popupref.extract()
    except Exception:
        self.log("Error removing Enlarge this text")

    return self.strip_anchors(soup)
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    old_soup, new_soup: BeautifulSoup()
    Need to strip <hr>, re-sort based on location, build new merged_soup
    with optional interleaved <hr> elements.

    The merge technique is taken from parent.reader_app_class.MERGE_INDEX
    (default 'hash'):

      'hash'      - annotations in new_soup whose hash is not present in
                    old_soup are appended to the re-rendered stored
                    annotations. When old_soup holds no user_annotations
                    div the result is new_soup unchanged.
      'timestamp' - stored and device annotations are paired by their
                    'uts' timestamp. The stored copy wins unless the
                    device copy has a different hash.

    The user_annotations div, when present, is extracted from old_soup
    (old_soup is modified). Returns the merged markup as a unicode string;
    for any other MERGE_INDEX value nothing is merged and None is returned.
    '''
    TRANSIENT_DB = 'transient'

    debug_print("merge_annotations - cid=", cid)
    debug_print("merge_annotations - old_soup=", old_soup)
    debug_print("merge_annotations - new_soup=", new_soup)

    # Fetch preferred merge index technique
    merge_index = getattr(parent.reader_app_class, 'MERGE_INDEX', 'hash')

    if merge_index == 'hash':
        # Get the hashes of any existing annotations
        old_hashes = {ua['hash'] for ua in old_soup.findAll('div', 'annotation')}
        debug_print("old hashes=", old_hashes)

        # Extract old user_annotations
        regurgitated_soup = None
        ouas = old_soup.find('div', 'user_annotations')
        if ouas is not None:
            debug_print("Getting old annotations - count=", len(ouas))
            debug_print("Getting old annotations - old_soup=", old_soup)
            debug_print("Getting old annotations - ouas=", ouas)
            ouas.extract()
            debug_print("Getting old annotations - ouas after extract=", ouas)
            debug_print("Getting old annotations - old_soup after extract=", old_soup)

            # Capture existing annotations
            annotation_list = parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(
                parent.opts.db.rerender_to_html_from_list(annotation_list))
            debug_print("Getting old annotations - regurgitated_soup=", regurgitated_soup)

        # Find new annotations
        new_hashes = {ua['hash'] for ua in new_soup.findAll('div', 'annotation')}
        debug_print("new_hashes=", sorted(new_hashes))
        debug_print("old hashes=", sorted(old_hashes))
        debug_print("new_hashes.difference(old_hashes)=", new_hashes.difference(old_hashes))

        updates = list(new_hashes.difference(old_hashes))
        debug_print("differences between old and new hashs - updates=", updates)

        if regurgitated_soup is not None:
            if updates:
                debug_print("have updates and ouas")
                # Append new to regurgitated
                dtc = len(regurgitated_soup.div)
                debug_print("length regurgitated_soup - dtc=", dtc)
                for new_annotation_id in updates:
                    debug_print("extending regurgitated_soup - new_annotation_id=",
                                new_annotation_id)
                    new_annotation = new_soup.find('div', {'hash': new_annotation_id})
                    regurgitated_soup.div.insert(dtc, new_annotation)
                    dtc += 1
            merged_soup = unicode(sort_merged_annotations(regurgitated_soup))
        else:
            # Nothing was stored before, so there is nothing to merge with.
            # (The earlier comparison against an empty BeautifulSoup() could
            # never differ on this path and has been dropped.)
            debug_print("no stored user_annotations - just new_soup")
            merged_soup = unicode(new_soup)

        debug_print("merged_soup=", merged_soup)
        return merged_soup

    elif merge_index == 'timestamp':
        timestamps = {}

        # Get the timestamps and hashes of the stored annotations
        for sua in old_soup.findAll('div', 'annotation'):
            try:
                timestamp = sua.find('td', 'timestamp')['uts']
                timestamps[timestamp] = {'stored_hash': sua['hash']}
            except Exception:
                # Stored annotation without usable timestamp/hash: ignore it
                continue

        # Rerender stored annotations
        regurgitated_soup = None
        ouas = old_soup.find('div', 'user_annotations')
        if ouas is not None:
            ouas.extract()

            # Capture existing annotations
            annotation_list = parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(
                parent.opts.db.rerender_to_html_from_list(annotation_list))

        # Stored annotations are normally looked up in the re-rendered soup.
        # NOTE(review): when old_soup has annotation divs but no
        # user_annotations wrapper this used to fail with an unbound name;
        # falling back to old_soup is an assumption - confirm.
        stored_source = regurgitated_soup if regurgitated_soup is not None else old_soup

        # Add device annotation timestamps and hashes
        for dua in new_soup.findAll('div', 'annotation'):
            try:
                timestamp = dua.find('td', 'timestamp')['uts']
                device_hash = dua['hash']
                timestamps.setdefault(timestamp, {})['device_hash'] = device_hash
            except Exception:
                print("ERROR: malformed timestamp in device annotation")
                print(dua.prettify())

        merged_soup = BeautifulSoup(ANNOTATIONS_HEADER)

        for ts in sorted(timestamps):
            entry = timestamps[ts]
            if 'device_hash' not in entry or entry.get('stored_hash') == entry['device_hash']:
                # Stored only, or stored matches device - add the stored
                # copy, as user may have modified it
                annotation = stored_source.find('div', {'hash': entry['stored_hash']})
            else:
                # Device only, or device has been updated since initial
                # capture - add from new_soup
                annotation = new_soup.find('div', {'hash': entry['device_hash']})

            if annotation is None:
                # The hash could not be located any more; nothing to add
                debug_print("merge_annotations - annotation not found for timestamp=", ts)
                continue
            merged_soup.div.append(annotation)

        return unicode(sort_merged_annotations(merged_soup))
def generate_html(comments):
    """Render the jacket template to HTML and strip empty header items.

    Builds the template arguments from values defined outside this
    function (title, author, series, rating, tags, ...) plus one entry per
    custom field reported by ``mi.custom_field_keys()``, formats
    ``template`` with ``SafeFormatter``, then removes header elements whose
    corresponding value is empty.

    :param comments: value substituted for the ``comments`` template field.
    :return: the rendered HTML as text, passed through
        ``strip_encoding_declarations``.
    """
    # Each escaped tag gets a 'ttt' suffix before being joined with spaces.
    tag_tokens = [escape(t) + 'ttt' for t in tags.tags_list]
    args = {
        'xmlns': XHTML_NS,
        'title_str': title_str,
        'css': css,
        'title': title,
        'author': author,
        'publisher': publisher,
        'pubdate_label': _('Published'),
        'pubdate': pubdate,
        'series_label': _('Series'),
        'series': series,
        'rating_label': _('Rating'),
        'rating': rating,
        'tags_label': _('Tags'),
        'tags': tags,
        'comments': comments,
        'footer': '',
        'searchable_tags': ' '.join(tag_tokens),
    }

    # Custom fields are exposed to the template with '#' replaced by '_',
    # along with a matching '<name>_label' entry.
    for key in mi.custom_field_keys():
        field_meta = mi.get_user_metadata(key, False) or {}
        try:
            display_name, value = mi.format_field_extended(key)[:2]
            arg_name = key.replace('#', '_')
            datatype = field_meta.get('datatype')
            if datatype == 'series':
                rendered = Series(mi.get(key), mi.get(key + '_index'))
            elif datatype == 'rating':
                raw_rating = mi.get(key)
                half_stars = field_meta.get('display', {}).get(
                    'allow_half_stars', False)
                rendered = rating_to_stars(raw_rating, half_stars)
            elif datatype == 'comments':
                text = value or ''
                display = field_meta.get('display', {})
                interpret_as = display.get('interpret_as') or 'html'
                if interpret_as == 'markdown':
                    rendered = markdown(text)
                elif interpret_as == 'short-text':
                    rendered = '<span>%s</span>' % escape(text)
                elif interpret_as == 'long-text':
                    rendered = (
                        '<pre style="white-space:pre-wrap">%s</pre>'
                        % escape(text))
                else:
                    rendered = comments_to_html(text)
            else:
                rendered = escape(value)
            args[arg_name] = rendered
            args[arg_name + '_label'] = escape(display_name)
        except Exception:
            # Best effort: a custom column that cannot be rendered (e.g. its
            # contents are None) is simply not added to args.
            pass

    # Debugging aid, disabled by default: list the custom column values
    # that the jacket template can reference.
    if False:
        print("Custom column values available in jacket template:")
        for name in args:
            if name.startswith('_') and not name.endswith('_label'):
                print(" %s: %s" % ('#' + name[1:], args[name]))

    # Used in the comment describing use of custom columns in templates.
    # Don't change this unless you also change it in template.xhtml
    args.setdefault('_genre_label', '{_genre_label}')
    args.setdefault('_genre', '{_genre}')

    generated_html = SafeFormatter().format(template, **args)

    # Post-process the generated html to strip out empty header items.
    soup = BeautifulSoup(generated_html)

    def discard(tag):
        # Detach the tag from the soup when the lookup actually found one.
        if tag is not None:
            tag.extract()

    header_items = (
        (series, 'cbj_series'),
        (rating, 'cbj_rating'),
        (tags, 'cbj_tags'),
        (pubdate, 'cbj_pubdata'),
    )
    for value, css_class in header_items:
        if not value:
            discard(soup.find(attrs={'class': css_class}))

    # The banner rule is kept only for the 'kindle' output profile.
    if output_profile.short_name != 'kindle':
        discard(soup.find('hr', attrs={'class': 'cbj_kindle_banner_hr'}))

    rendered_html = soup.renderContents('utf-8').decode('utf-8')
    return strip_encoding_declarations(rendered_html)