def extract_calibre_cover(raw, base, log):
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(raw)
    matches = soup.find(name=["h1", "h2", "h3", "h4", "h5", "h6", "p", "span", "font", "br"])
    images = soup.findAll("img")
    if matches is None and len(images) == 1 and images[0].get("alt", "") == "cover":
        img = images[0]
        img = os.path.join(base, *img["src"].split("/"))
        if os.path.exists(img):
            return open(img, "rb").read()

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find("body")
        if body is not None:
            text = u"".join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll("img", src=True)
            if 0 < len(images) < 2:
                img = os.path.join(base, *images[0]["src"].split("/"))
                if os.path.exists(img):
                    return open(img, "rb").read()
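A minimal usage sketch for the cover extractor above. The markup string and directory are invented for illustration; in practice the caller passes the raw HTML of a candidate cover page and the directory against which its img src is resolved.

# Hypothetical inputs, for illustration only.
import os

raw = '<html><body><img src="images/cover.jpg" alt="cover"/></body></html>'
base = '/tmp/book'  # directory the relative src attribute is resolved against
cover_data = extract_calibre_cover(raw, base, log=None)
if cover_data is not None:
    # cover_data is the raw bytes of the image file on disk
    with open(os.path.join(base, 'extracted_cover.jpg'), 'wb') as f:
        f.write(cover_data)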
def get_soup(self, src, url=None):
    nmassage = []
    nmassage.extend(self.preprocess_regexps)
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    for pat, repl in nmassage:
        usrc = pat.sub(repl, usrc)
    soup = BeautifulSoup(usrc)
    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
        for pat, repl in nmassage:
            replace = pat.sub(repl, replace)
        soup = BeautifulSoup(replace)

    if self.keep_only_tags:
        body = soup.new_tag('body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
def generate_html(comments):
    args = dict(xmlns=XHTML_NS,
                title_str=title_str,
                css=css,
                title=title,
                author=author,
                publisher=publisher,
                pubdate_label=_('Published'), pubdate=pubdate,
                series_label=_('Series'), series=series,
                rating_label=_('Rating'), rating=rating,
                tags_label=_('Tags'), tags=tags,
                comments=comments,
                footer=''
                )
    for key in mi.custom_field_keys():
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            key = key.replace('#', '_')
            args[key] = escape(val)
            args[key+'_label'] = escape(display_name)
        except:
            pass

    # Used in the comment describing use of custom columns in templates
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    generated_html = P('jacket/template.xhtml', data=True).decode('utf-8').format(**args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    if not series:
        series_tag = soup.find(attrs={'class':'cbj_series'})
        if series_tag is not None:
            series_tag.extract()
    if not rating:
        rating_tag = soup.find(attrs={'class':'cbj_rating'})
        if rating_tag is not None:
            rating_tag.extract()
    if not tags:
        tags_tag = soup.find(attrs={'class':'cbj_tags'})
        if tags_tag is not None:
            tags_tag.extract()
    if not pubdate:
        pubdate_tag = soup.find(attrs={'class':'cbj_pubdata'})
        if pubdate_tag is not None:
            pubdate_tag.extract()
    if output_profile.short_name != 'kindle':
        hr_tag = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'})
        if hr_tag is not None:
            hr_tag.extract()

    return strip_encoding_declarations(
        soup.renderContents('utf-8').decode('utf-8'))
def get_soup(self, src, url=None):
    nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
    nmassage.extend(self.preprocess_regexps)
    # Some websites have buggy doctype declarations that mess up beautifulsoup
    nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    soup = BeautifulSoup(usrc, markupMassage=nmassage)
    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0],
                             markupMassage=nmassage)

    if self.keep_only_tags:
        body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
def existing_annotations(parent, field, return_all=False):
    '''
    Return count of existing annotations, or existence of any
    '''
    import calibre_plugins.annotations.config as cfg
    annotation_map = []
    if field:
        db = parent.opts.gui.current_db
        id = db.FIELD_MAP['id']
        for i, record in enumerate(db.data.iterall()):
            mi = db.get_metadata(record[id], index_is_id=True)
            if field == 'Comments':
                if mi.comments:
                    soup = BeautifulSoup(mi.comments)
                else:
                    continue
            else:
                soup = BeautifulSoup(mi.get_user_metadata(field, False)['#value#'])
            if soup.find('div', 'user_annotations') is not None:
                annotation_map.append(mi.id)
                if not return_all:
                    break
        if return_all:
            _log_location("Identified %d annotated books of %d total books" %
                          (len(annotation_map), len(db.data)))
    return annotation_map
def save_soup(soup, target):
    ns = BeautifulSoup('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    nm = ns.find('meta')
    metas = soup.findAll('meta', content=True)
    added = False
    for meta in metas:
        if 'charset' in meta.get('content', '').lower():
            meta.replaceWith(nm)
            added = True
    if not added:
        head = soup.find('head')
        if head is not None:
            head.insert(0, nm)

    selfdir = os.path.dirname(target)

    for tag in soup.findAll(['img', 'link', 'a']):
        for key in ('src', 'href'):
            path = tag.get(key, None)
            if path and os.path.isfile(path) and os.path.exists(path) and os.path.isabs(path):
                tag[key] = unicode_path(relpath(path, selfdir).replace(os.sep, '/'))

    html = unicode(soup)
    with open(target, 'wb') as f:
        f.write(html.encode('utf-8'))
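A minimal usage sketch for save_soup. The file names are hypothetical; BeautifulSoup here is the same parser imported by the surrounding functions. The call normalizes the charset meta to UTF-8, rewrites any absolute local src/href attributes so they are relative to the target file, and writes the document out as UTF-8.

import os

soup = BeautifulSoup(open('input.html', 'rb').read())
save_soup(soup, os.path.abspath('output.html'))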
def existing_annotations(parent, field, return_all=False):
    """
    Return count of existing annotations, or existence of any
    """
    # import calibre_plugins.marvin_manager.config as cfg
    _log_location(field)
    annotation_map = []
    if field:
        db = parent.opts.gui.current_db
        id = db.FIELD_MAP["id"]
        for i, record in enumerate(db.data.iterall()):
            mi = db.get_metadata(record[id], index_is_id=True)
            if field == "Comments":
                if mi.comments:
                    soup = BeautifulSoup(mi.comments)
                else:
                    continue
            else:
                soup = BeautifulSoup(mi.get_user_metadata(field, False)["#value#"])
            if soup.find("div", "user_annotations") is not None:
                annotation_map.append(mi.id)
                if not return_all:
                    break
        if return_all:
            _log("Identified %d annotated books of %d total books" %
                 (len(annotation_map), len(db.data)))
        _log("annotation_map: %s" % repr(annotation_map))
    else:
        _log("no active field")
    return annotation_map
def get_series(title, authors, timeout=60):
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode):
        title = title.encode('utf-8')
    title = urllib.quote_plus(title)

    author = authors[0].strip()
    if not author:
        return mi
    if ',' in author:
        author = author.split(',')[0]
    else:
        author = author.split()[-1]

    url = URL.format(author, title)
    br = browser()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi
    raw = xml_to_unicode(raw)[0]
    soup = BeautifulSoup(raw)
    searcharea = soup.find('div', attrs={'class':'searcharea'})
    if searcharea is None:
        return mi
    ss = searcharea.find('div', attrs={'class':'seriessearch'})
    if ss is None:
        return mi
    a = ss.find('a', href=True)
    if a is None:
        return mi
    href = a['href'].partition('?')[-1]
    data = urlparse.parse_qs(href)
    series = data.get('SeriesName', [])
    if not series:
        return mi
    series = series[0]
    series = re.sub(r' series$', '', series).strip()
    if series:
        mi.series = series
        ns = ss.nextSibling
        if ns.contents:
            raw = unicode(ns.contents[0])
            raw = raw.partition('.')[0].strip()
            try:
                mi.series_index = int(raw)
            except:
                pass
    return mi
def find_all_annotated_books(self):
    '''
    Find all annotated books in library
    '''
    self._log_location("field: {0}".format(self.field))
    cids = self.cdb.search_getting_ids('formats:EPUB', '')
    for cid in cids:
        mi = self.cdb.get_metadata(cid, index_is_id=True)
        raw = mi.get_user_metadata(self.field, False)
        if raw['#value#'] is not None:
            soup = BeautifulSoup(raw['#value#'])
            if soup.find('div', 'user_annotations') is not None:
                self.annotation_map.append(mi.id)
def _remove_old_style(self, html):
    '''
    Remove the old style tag, finalize soup in preparation for styling
    '''
    unstyled_soup = BeautifulSoup(html)
    head = unstyled_soup.find("head")
    voc = unstyled_soup.body.find('div', {'class': 'vocabulary'})
    tds = voc.findAll(lambda tag: tag.name == 'td' and tag.a)
    dart = random.randrange(len(tds))
    self.td = tds[dart]
    self.oh = self.td.a['href']
    self.td.a['href'] = self._finalize()
    old_style = head.find('style')
    if old_style:
        old_style.extract()
    return unstyled_soup
def _inject_css(self, html):
    '''
    stick a <style> element into html
    '''
    css = self.prefs.get('injected_css', None)
    if css:
        try:
            styled_soup = BeautifulSoup(html)
            head = styled_soup.find("head")
            style_tag = Tag(styled_soup, 'style')
            style_tag['type'] = "text/css"
            style_tag.insert(0, css)
            head.insert(0, style_tag)
            html = styled_soup.renderContents()
        except:
            return html
    return html
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p> returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()
    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode_type):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>'%x.replace(u'\n', u'<br />')
                 for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.', '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(lost_cr.group(),
                                    '%s%s\n\n%s' % (lost_cr.group(1),
                                                    lost_cr.group(2),
                                                    lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '—')

    soup = BeautifulSoup('<div>' + comments + '</div>').find('div')
    result = BeautifulSoup('<div>')
    container = result.find('div')
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if isinstance(token, (CData, Comment, Declaration, ProcessingInstruction)):
            continue
        if isinstance(token, NavigableString):
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a', 'hr']:
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                container.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            container.insert(rtc, token)
            rtc += 1

    if open_pTag:
        container.insert(rtc, pTag)

    for p in container.findAll('p'):
        p['class'] = 'description'

    return container.decode_contents().replace('<br></br>', '<br>')
def move_annotations(parent, annotation_map, old_destination_field, new_destination_field,
                     window_title="Moving annotations"):
    """
    Move annotations from old_destination_field to new_destination_field
    annotation_map precalculated in thread in config.py
    """
    import calibre_plugins.marvin_manager.config as cfg

    _log_location(annotation_map)
    _log(" %s -> %s" % (old_destination_field, new_destination_field))

    db = parent.opts.gui.current_db
    id = db.FIELD_MAP["id"]

    # Show progress
    pb = ProgressBar(parent=parent, window_title=window_title)
    total_books = len(annotation_map)
    pb.set_maximum(total_books)
    pb.set_value(1)
    pb.set_label("{:^100}".format("Moving annotations for %d books" % total_books))
    pb.show()

    transient_db = "transient"

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        cfg.plugin_prefs.get("COMMENTS_DIVIDER", "· · • · ✦ · • · ·"))

    for cid in annotation_map:
        mi = db.get_metadata(cid, index_is_id=True)

        # Comments -> custom
        if old_destination_field == "Comments" and new_destination_field.startswith("#"):
            if mi.comments:
                old_soup = BeautifulSoup(mi.comments)
                uas = old_soup.find("div", "user_annotations")
                if uas:
                    # Remove user_annotations from Comments
                    uas.extract()

                    # Remove comments_divider from Comments
                    cd = old_soup.find("div", "comments_divider")
                    if cd:
                        cd.extract()

                    # Save stripped Comments
                    mi.comments = unicode(old_soup)

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Add user_annotations to destination
                    um = mi.metadata_for_field(new_destination_field)
                    um["#value#"] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record with stripped Comments, populated custom field
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # custom -> Comments
        elif old_destination_field.startswith("#") and new_destination_field == "Comments":
            if mi.get_user_metadata(old_destination_field, False)["#value#"] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)["#value#"])
                uas = old_soup.find("div", "user_annotations")
                if uas:
                    # Remove user_annotations from custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um["#value#"] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add user_annotations to Comments
                    if mi.comments is None:
                        mi.comments = unicode(new_soup)
                    else:
                        mi.comments = mi.comments + unicode(comments_divider) + unicode(new_soup)

                    # Update the record with stripped custom field, updated Comments
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # custom -> custom
        elif old_destination_field.startswith("#") and new_destination_field.startswith("#"):
            if mi.get_user_metadata(old_destination_field, False)["#value#"] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)["#value#"])
                uas = old_soup.find("div", "user_annotations")
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um["#value#"] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um["#value#"] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # same field -> same field - called from config:configure_appearance()
        elif old_destination_field == new_destination_field:
            pb.set_label("{:^100}".format("Updating annotations for %d books" % total_books))

            if new_destination_field == "Comments":
                if mi.comments:
                    old_soup = BeautifulSoup(mi.comments)
                    uas = old_soup.find("div", "user_annotations")
                    if uas:
                        # Remove user_annotations from Comments
                        uas.extract()

                        # Remove comments_divider from Comments
                        cd = old_soup.find("div", "comments_divider")
                        if cd:
                            cd.extract()

                        # Save stripped Comments
                        mi.comments = unicode(old_soup)

                        # Capture content
                        parent.opts.db.capture_content(uas, cid, transient_db)

                        # Regurgitate content with current CSS style
                        new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                        # Add user_annotations to Comments
                        if mi.comments is None:
                            mi.comments = unicode(new_soup)
                        else:
                            mi.comments = mi.comments + unicode(comments_divider) + unicode(new_soup)

                        # Update the record with stripped custom field, updated Comments
                        db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                        commit=True, force_changes=True, notify=True)
                        pb.increment()
            else:
                # Update custom field
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)["#value#"])
                uas = old_soup.find("div", "user_annotations")
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Add stripped old_soup plus new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um["#value#"] = unicode(old_soup) + unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

    # Hide the progress bar
    pb.hide()

    # Get the eligible custom fields
    all_custom_fields = db.custom_field_keys()
    custom_fields = {}
    for cf in all_custom_fields:
        field_md = db.metadata_for_field(cf)
        if field_md["datatype"] in ["comments"]:
            custom_fields[field_md["name"]] = {"field": cf, "datatype": field_md["datatype"]}

    # Change field value to friendly name
    if old_destination_field.startswith("#"):
        for cf in custom_fields:
            if custom_fields[cf]["field"] == old_destination_field:
                old_destination_field = cf
                break
    if new_destination_field.startswith("#"):
        for cf in custom_fields:
            if custom_fields[cf]["field"] == new_destination_field:
                new_destination_field = cf
                break

    # Report what happened
    if old_destination_field == new_destination_field:
        msg = "<p>Annotations updated to new appearance settings for %d {0}.</p>" % len(annotation_map)
    else:
        msg = "<p>Annotations for %d {0} moved from <b>%s</b> to <b>%s</b>.</p>" % (
            len(annotation_map), old_destination_field, new_destination_field)
    if len(annotation_map) == 1:
        msg = msg.format("book")
    else:
        msg = msg.format("books")
    MessageBox(MessageBox.INFO, "", msg=msg, show_copy_button=False, parent=parent.gui).exec_()
    _log("INFO: %s" % msg)

    # Update the UI
    updateCalibreGUIView()
def _reformat(self, data, htmlpath):
    if self.input_encoding:
        data = data.decode(self.input_encoding)
    try:
        data = xml_to_unicode(data, strip_encoding_pats=True)[0]
        soup = BeautifulSoup(data)
    except ValueError:
        # hit some strange encoding problems...
        self.log.exception("Unable to parse html for cleaning, leaving it")
        return data
    # nuke javascript...
    [s.extract() for s in soup('script')]
    # See if everything is inside a <head> tag
    # https://bugs.launchpad.net/bugs/1273512
    body = soup.find('body')
    if body is not None and body.parent.name == 'head':
        html = soup.find('html')
        html.insert(len(html), body)

    # remove forward and back nav bars from the top/bottom of each page
    # cos they really f**k with the flow of things and generally waste space
    # since we can't use [a,b] syntax to select arbitrary items from a list
    # we'll have to do this manually...
    # only remove the tables, if they have an image with an alt attribute
    # containing prev, next or team
    t = soup('table')
    if t:
        if (t[0].previousSibling is None or
                t[0].previousSibling.previousSibling is None):
            try:
                alt = t[0].img['alt'].lower()
                if alt.find('prev') != -1 or alt.find('next') != -1 or alt.find('team') != -1:
                    t[0].extract()
            except:
                pass
        if (t[-1].nextSibling is None or
                t[-1].nextSibling.nextSibling is None):
            try:
                alt = t[-1].img['alt'].lower()
                if alt.find('prev') != -1 or alt.find('next') != -1 or alt.find('team') != -1:
                    t[-1].extract()
            except:
                pass
    # for some very odd reason each page's content appears to be in a table
    # too. and this table has sub-tables for random asides... grr.

    # remove br at top of page if present after nav bars removed
    br = soup('br')
    if br:
        if check_all_prev_empty(br[0].previousSibling):
            br[0].extract()

    # some images seem to be broken in some chm's :/
    base = os.path.dirname(htmlpath)
    for img in soup('img', src=True):
        src = img['src']
        ipath = os.path.join(base, *src.split('/'))
        if os.path.exists(ipath):
            continue
        src = src.split(';')[0]
        if not src:
            continue
        ipath = os.path.join(base, *src.split('/'))
        if not os.path.exists(ipath):
            while src.startswith('../'):
                src = src[3:]
        img['src'] = src

    try:
        # if there is only a single table with a single element
        # in the body, replace it by the contents of this single element
        tables = soup.body.findAll('table', recursive=False)
        if tables and len(tables) == 1:
            trs = tables[0].findAll('tr', recursive=False)
            if trs and len(trs) == 1:
                tds = trs[0].findAll('td', recursive=False)
                if tds and len(tds) == 1:
                    tdContents = tds[0].contents
                    tableIdx = soup.body.contents.index(tables[0])
                    tables[0].extract()
                    while tdContents:
                        soup.body.insert(tableIdx, tdContents.pop())
    except:
        pass

    # do not prettify, it would reformat the <pre> tags!
    try:
        ans = str(soup)
        self.re_encoded_files.add(os.path.abspath(htmlpath))
        return ans
    except RuntimeError:
        return data
def generate_html(comments):
    args = dict(xmlns=XHTML_NS,
                title_str=title_str,
                css=css,
                title=title,
                author=author,
                publisher=publisher,
                pubdate_label=_('Published'), pubdate=pubdate,
                series_label=_('Series'), series=series,
                rating_label=_('Rating'), rating=rating,
                tags_label=_('Tags'), tags=tags,
                comments=comments,
                footer='')
    for key in mi.custom_field_keys():
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            key = key.replace('#', '_')
            args[key] = escape(val)
            args[key + '_label'] = escape(display_name)
        except:
            # if the val (custom column contents) is None, don't add to args
            pass

    if False:
        print("Custom column values available in jacket template:")
        for key in args.keys():
            if key.startswith('_') and not key.endswith('_label'):
                print(" %s: %s" % ('#' + key[1:], args[key]))

    # Used in the comment describing use of custom columns in templates
    # Don't change this unless you also change it in template.xhtml
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    generated_html = P('jacket/template.xhtml', data=True).decode('utf-8').format(**args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    if not series:
        series_tag = soup.find(attrs={'class': 'cbj_series'})
        if series_tag is not None:
            series_tag.extract()
    if not rating:
        rating_tag = soup.find(attrs={'class': 'cbj_rating'})
        if rating_tag is not None:
            rating_tag.extract()
    if not tags:
        tags_tag = soup.find(attrs={'class': 'cbj_tags'})
        if tags_tag is not None:
            tags_tag.extract()
    if not pubdate:
        pubdate_tag = soup.find(attrs={'class': 'cbj_pubdata'})
        if pubdate_tag is not None:
            pubdate_tag.extract()
    if output_profile.short_name != 'kindle':
        hr_tag = soup.find('hr', attrs={'class': 'cbj_kindle_banner_hr'})
        if hr_tag is not None:
            hr_tag.extract()

    return strip_encoding_declarations(
        soup.renderContents('utf-8').decode('utf-8'))
def generate_html(comments):
    args = dict(
        xmlns=XHTML_NS,
        title_str=title_str,
        css=css,
        title=title,
        author=author,
        publisher=publisher,
        pubdate_label=_('Published'), pubdate=pubdate,
        series_label=_('Series'), series=series,
        rating_label=_('Rating'), rating=rating,
        tags_label=_('Tags'), tags=tags,
        comments=comments,
        footer='',
        searchable_tags=' '.join(escape(t) + 'ttt' for t in tags.tags_list),
    )
    for key in mi.custom_field_keys():
        m = mi.get_user_metadata(key, False) or {}
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            dkey = key.replace('#', '_')
            dt = m.get('datatype')
            if dt == 'series':
                args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
            elif dt == 'rating':
                args[dkey] = rating_to_stars(
                    mi.get(key), m.get('display', {}).get('allow_half_stars', False))
            else:
                args[dkey] = escape(val)
            args[dkey + '_label'] = escape(display_name)
        except Exception:
            # if the val (custom column contents) is None, don't add to args
            pass

    if False:
        print("Custom column values available in jacket template:")
        for key in list(args.keys()):
            if key.startswith('_') and not key.endswith('_label'):
                print(" %s: %s" % ('#' + key[1:], args[key]))

    # Used in the comment describing use of custom columns in templates
    # Don't change this unless you also change it in template.xhtml
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    formatter = SafeFormatter()
    generated_html = formatter.format(template, **args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    if not series:
        series_tag = soup.find(attrs={'class': 'cbj_series'})
        if series_tag is not None:
            series_tag.extract()
    if not rating:
        rating_tag = soup.find(attrs={'class': 'cbj_rating'})
        if rating_tag is not None:
            rating_tag.extract()
    if not tags:
        tags_tag = soup.find(attrs={'class': 'cbj_tags'})
        if tags_tag is not None:
            tags_tag.extract()
    if not pubdate:
        pubdate_tag = soup.find(attrs={'class': 'cbj_pubdata'})
        if pubdate_tag is not None:
            pubdate_tag.extract()
    if output_profile.short_name != 'kindle':
        hr_tag = soup.find('hr', attrs={'class': 'cbj_kindle_banner_hr'})
        if hr_tag is not None:
            hr_tag.extract()

    return strip_encoding_declarations(
        soup.renderContents('utf-8').decode('utf-8'))
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    old_soup, new_soup: BeautifulSoup()
    Need to strip <hr>, re-sort based on location, build new merged_soup
    with optional interleaved <hr> elements.
    '''
    TRANSIENT_DB = 'transient'

    if False:
        '''
        Older technique: Use hashes to merge annotations
        '''
        # Get the hashes of any existing annotations
        oiuas = old_soup.findAll('div', 'annotation')
        old_hashes = set([ua['hash'] for ua in oiuas])

        # Extract old user_annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))

        # Find new annotations
        uas = new_soup.findAll('div', 'annotation')
        new_hashes = set([ua['hash'] for ua in uas])

        updates = list(new_hashes.difference(old_hashes))
        if len(updates) and ouas is not None:
            # Append new to regurgitated
            dtc = len(regurgitated_soup.div)
            for new_annotation_id in updates:
                new_annotation = new_soup.find('div', {'hash': new_annotation_id})
                regurgitated_soup.div.insert(dtc, new_annotation)
                dtc += 1
            if old_soup:
                merged_soup = unicode(old_soup) + unicode(sort_merged_annotations(regurgitated_soup))
            else:
                merged_soup = unicode(sort_merged_annotations(regurgitated_soup))
        else:
            if old_soup:
                merged_soup = unicode(old_soup) + unicode(new_soup)
            else:
                merged_soup = unicode(new_soup)
        return merged_soup

    else:
        '''
        Newer technique: Use timestamps to merge annotations
        '''
        timestamps = {}

        # Get the timestamps and hashes of the stored annotations
        suas = old_soup.findAll('div', 'annotation')
        for sua in suas:
            #print("sua: %s" % sua.prettify())
            timestamp = sua.find('td', 'timestamp')['uts']
            timestamps[timestamp] = {'stored_hash': sua['hash']}

        # Rerender stored annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate annotations with current CSS
            rerendered_annotations = parent.opts.db.rerender_to_html(TRANSIENT_DB, cid)
            regurgitated_soup = BeautifulSoup(rerendered_annotations)

        # Add device annotation timestamps and hashes
        duas = new_soup.findAll('div', 'annotation')
        for dua in duas:
            timestamp = dua.find('td', 'timestamp')['uts']
            if timestamp in timestamps:
                timestamps[timestamp]['device_hash'] = dua['hash']
            else:
                timestamps[timestamp] = {'device_hash': dua['hash']}

        merged_annotations = Tag(BeautifulSoup(), 'div',
                                 [('class', "user_annotations"), ('style', 'margin:0')])

        for ts in sorted(timestamps):
            if 'stored_hash' in timestamps[ts] and not 'device_hash' in timestamps[ts]:
                # Stored only - add from regurgitated_soup
                annotation = regurgitated_soup.find('div', {'hash': timestamps[ts]['stored_hash']})
            elif not 'stored_hash' in timestamps[ts] and 'device_hash' in timestamps[ts]:
                # Device only - add from new_soup
                annotation = new_soup.find('div', {'hash': timestamps[ts]['device_hash']})
            elif timestamps[ts]['stored_hash'] == timestamps[ts]['device_hash']:
                # Stored matches device - add from regurgitated_soup, as user may have modified
                annotation = regurgitated_soup.find('div', {'hash': timestamps[ts]['stored_hash']})
            elif timestamps[ts]['stored_hash'] != timestamps[ts]['device_hash']:
                # Device has been updated since initial capture - add from new_soup
                annotation = new_soup.find('div', {'hash': timestamps[ts]['device_hash']})
            else:
                continue

            merged_annotations.append(annotation)

        merged_annotations = sort_merged_annotations(merged_annotations)

        # Update new_soup with merged_annotations
        new_soup_uas = new_soup.find('div', 'user_annotations')
        new_soup_uas.replaceWith(merged_annotations)

    return unicode(new_soup)
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s
    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p> returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()
    '''
    if not comments:
        return '<p></p>'
    if not isinstance(comments, str):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [
            '<p class="description">%s</p>' % x.replace('\n', '<br />')
            for x in comments.split('\n\n')
        ]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return '<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(
        lambda m: m.group().replace('.', '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(
            lost_cr.group(),
            '%s%s\n\n%s' % (lost_cr.group(1), lost_cr.group(2), lost_cr.group(3)))

    comments = comments.replace('\r', '')
    # Convert \n\n to <p>s
    comments = comments.replace('\n\n', '<p>')
    # Convert solo returns to <br />
    comments = comments.replace('\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '—')

    soup = BeautifulSoup('<div>' + comments + '</div>').find('div')
    result = BeautifulSoup('<div>')
    container = result.find('div')
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    inline_tags = ('br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a', 'hr')
    for token in all_tokens:
        if isinstance(token, (CData, Comment, Declaration, ProcessingInstruction)):
            continue
        if isinstance(token, NavigableString):
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        elif token.name in inline_tags:
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                container.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            container.insert(rtc, token)
            rtc += 1

    if open_pTag:
        container.insert(rtc, pTag)

    for p in container.findAll('p'):
        p['class'] = 'description'

    return container.decode_contents()
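A few illustrative calls matching the docstring's contract above. The outputs shown as comments restate the documented behavior rather than captured output; note that the code adds class="description" to the paragraphs it creates.

comments_to_html('plain text')
# -> '<p class="description">plain text</p>'

comments_to_html('A line of text\n\nFollowed by a line of text')
# -> two <p class="description"> blocks, one per paragraph

comments_to_html('<p>pre-formatted text</p>')
# -> returned untouched, since the input already starts with '<'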
def generate_html(comments):
    args = dict(xmlns=XHTML_NS,
                title_str=title_str,
                css=css,
                title=title,
                author=author,
                publisher=publisher,
                pubdate_label=_('Published'), pubdate=pubdate,
                series_label=_('Series'), series=series,
                rating_label=_('Rating'), rating=rating,
                tags_label=_('Tags'), tags=tags,
                comments=comments,
                footer='',
                searchable_tags=' '.join(escape(t)+'ttt' for t in tags.tags_list),
                )
    for key in mi.custom_field_keys():
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            key = key.replace('#', '_')
            args[key] = escape(val)
            args[key+'_label'] = escape(display_name)
        except:
            # if the val (custom column contents) is None, don't add to args
            pass

    if False:
        print("Custom column values available in jacket template:")
        for key in args.keys():
            if key.startswith('_') and not key.endswith('_label'):
                print(" %s: %s" % ('#' + key[1:], args[key]))

    # Used in the comment describing use of custom columns in templates
    # Don't change this unless you also change it in template.xhtml
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    formatter = SafeFormatter()
    generated_html = formatter.format(template, **args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    if not series:
        series_tag = soup.find(attrs={'class':'cbj_series'})
        if series_tag is not None:
            series_tag.extract()
    if not rating:
        rating_tag = soup.find(attrs={'class':'cbj_rating'})
        if rating_tag is not None:
            rating_tag.extract()
    if not tags:
        tags_tag = soup.find(attrs={'class':'cbj_tags'})
        if tags_tag is not None:
            tags_tag.extract()
    if not pubdate:
        pubdate_tag = soup.find(attrs={'class':'cbj_pubdata'})
        if pubdate_tag is not None:
            pubdate_tag.extract()
    if output_profile.short_name != 'kindle':
        hr_tag = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'})
        if hr_tag is not None:
            hr_tag.extract()

    return strip_encoding_declarations(
        soup.renderContents('utf-8').decode('utf-8'))
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    old_soup, new_soup: BeautifulSoup()
    Need to strip <hr>, re-sort based on location, build new merged_soup
    with optional interleaved <hr> elements.
    '''
    TRANSIENT_DB = 'transient'

    # Fetch preferred merge index technique
    merge_index = getattr(parent.reader_app_class, 'MERGE_INDEX', 'hash')

    if merge_index == 'hash':
        # Get the hashes of any existing annotations
        oiuas = old_soup.findAll('div', 'annotation')
        old_hashes = set([ua['hash'] for ua in oiuas])

        # Extract old user_annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))

        # Find new annotations
        uas = new_soup.findAll('div', 'annotation')
        new_hashes = set([ua['hash'] for ua in uas])

        updates = list(new_hashes.difference(old_hashes))
        if len(updates) and ouas is not None:
            # Append new to regurgitated
            dtc = len(regurgitated_soup.div)
            for new_annotation_id in updates:
                new_annotation = new_soup.find('div', {'hash': new_annotation_id})
                regurgitated_soup.div.insert(dtc, new_annotation)
                dtc += 1
            if old_soup:
                merged_soup = unicode(old_soup) + unicode(sort_merged_annotations(regurgitated_soup))
            else:
                merged_soup = unicode(sort_merged_annotations(regurgitated_soup))
        else:
            if old_soup:
                merged_soup = unicode(old_soup) + unicode(new_soup)
            else:
                merged_soup = unicode(new_soup)
        return merged_soup

    elif merge_index == 'timestamp':
        timestamps = {}

        # Get the timestamps and hashes of the stored annotations
        suas = old_soup.findAll('div', 'annotation')
        for sua in suas:
            try:
                timestamp = sua.find('td', 'timestamp')['uts']
                timestamps[timestamp] = {'stored_hash': sua['hash']}
            except:
                continue

        # Rerender stored annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))

        # Add device annotation timestamps and hashes
        duas = new_soup.findAll('div', 'annotation')
        for dua in duas:
            try:
                timestamp = dua.find('td', 'timestamp')['uts']
                if timestamp in timestamps:
                    timestamps[timestamp]['device_hash'] = dua['hash']
                else:
                    timestamps[timestamp] = {'device_hash': dua['hash']}
            except:
                print("ERROR: malformed timestamp in device annotation")
                print(dua.prettify())

        merged_soup = BeautifulSoup(ANNOTATIONS_HEADER)

        for ts in sorted(timestamps):
            if 'stored_hash' in timestamps[ts] and not 'device_hash' in timestamps[ts]:
                # Stored only - add from regurgitated_soup
                annotation = regurgitated_soup.find('div', {'hash': timestamps[ts]['stored_hash']})
            elif not 'stored_hash' in timestamps[ts] and 'device_hash' in timestamps[ts]:
                # Device only - add from new_soup
                annotation = new_soup.find('div', {'hash': timestamps[ts]['device_hash']})
            elif timestamps[ts]['stored_hash'] == timestamps[ts]['device_hash']:
                # Stored matches device - add from regurgitated_soup, as user may have modified
                annotation = regurgitated_soup.find('div', {'hash': timestamps[ts]['stored_hash']})
            elif timestamps[ts]['stored_hash'] != timestamps[ts]['device_hash']:
                # Device has been updated since initial capture - add from new_soup
                annotation = new_soup.find('div', {'hash': timestamps[ts]['device_hash']})
            else:
                continue

            merged_soup.div.append(annotation)

        return unicode(sort_merged_annotations(merged_soup))
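The timestamp branch above reduces to a small decision table keyed by annotation timestamp. A self-contained sketch of that core logic, with plain dicts standing in for the soup objects:

def merge_by_timestamp(stored, device):
    # stored/device: dicts mapping timestamp -> annotation hash.
    # Returns, per timestamp in sorted order, which side's annotation to keep.
    merged = {}
    for ts in sorted(set(stored) | set(device)):
        if ts in stored and ts not in device:
            merged[ts] = 'stored'    # only in the library copy
        elif ts in device and ts not in stored:
            merged[ts] = 'device'    # new on the device
        elif stored[ts] == device[ts]:
            merged[ts] = 'stored'    # unchanged; keep the re-rendered library copy
        else:
            merged[ts] = 'device'    # edited on the device since initial capture
    return merged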
def move_annotations(parent, annotation_map, old_destination_field, new_destination_field,
                     window_title=_("Moving annotations")):
    '''
    Move annotations from old_destination_field to new_destination_field
    annotation_map precalculated in thread in config.py
    '''
    import calibre_plugins.annotations.config as cfg

    _log_location("%s -> %s" % (old_destination_field, new_destination_field))

    library_db = parent.opts.gui.current_db
    id = library_db.FIELD_MAP['id']

    # Show progress
    pb = ProgressBar(parent=parent, window_title=window_title, on_top=True)
    total_books = len(annotation_map)
    pb.set_maximum(total_books)
    pb.set_value(1)
    pb.set_label('{:^100}'.format('%s for %d books' % (window_title, total_books)))
    pb.show()

    id_map_old_destination_field = {}
    id_map_new_destination_field = {}
    transient_db = 'transient'

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        cfg.plugin_prefs.get('COMMENTS_DIVIDER', '· · • · ✦ · • · ·'))

    for cid in annotation_map:
        mi = library_db.get_metadata(cid, index_is_id=True)

        # Comments -> custom
        if old_destination_field == 'Comments' and new_destination_field.startswith('#'):
            if mi.comments:
                old_soup = BeautifulSoup(mi.comments)
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from Comments
                    uas.extract()

                    # Remove comments_divider from Comments
                    cd = old_soup.find('div', 'comments_divider')
                    if cd:
                        cd.extract()

                    # Capture content
                    annotation_list = parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html_from_list(annotation_list)

                    id_map_old_destination_field[cid] = unicode(old_soup)
                    id_map_new_destination_field[cid] = unicode(new_soup)

                    pb.increment()

        # custom -> Comments
        elif old_destination_field.startswith('#') and new_destination_field == 'Comments':
            if mi.get_user_metadata(old_destination_field, False)['#value#'] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from custom field
                    uas.extract()

                    # Capture content
                    annotation_list = parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html_from_list(annotation_list)

                    # Add user_annotations to Comments
                    new_comments = ''
                    if mi.comments is None:
                        new_comments = unicode(new_soup)
                    else:
                        new_comments = mi.comments + \
                                       unicode(comments_divider) + \
                                       unicode(new_soup)

                    # # Update the record with stripped custom field, updated Comments
                    # library_db.set_metadata(cid, mi, set_title=False, set_authors=False,
                    #                         commit=True, force_changes=True, notify=True)
                    id_map_old_destination_field[cid] = unicode(old_soup)
                    id_map_new_destination_field[cid] = new_comments

                    pb.increment()

        # custom -> custom
        elif old_destination_field.startswith('#') and new_destination_field.startswith('#'):
            if mi.get_user_metadata(old_destination_field, False)['#value#'] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    annotation_list = parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html_from_list(annotation_list)

                    id_map_old_destination_field[cid] = unicode(old_soup)
                    id_map_new_destination_field[cid] = unicode(new_soup)

                    pb.increment()

        # same field -> same field - called from config:configure_appearance()
        elif (old_destination_field == new_destination_field):
            pb.set_label('{:^100}'.format(_('Updating annotations for {0} books').format(total_books)))

            if new_destination_field == 'Comments':
                if mi.comments:
                    old_soup = BeautifulSoup(mi.comments)
                    uas = old_soup.find('div', 'user_annotations')
                    if uas:
                        # Remove user_annotations from Comments
                        uas.extract()

                        # Remove comments_divider from Comments
                        cd = old_soup.find('div', 'comments_divider')
                        if cd:
                            cd.extract()

                        # Save stripped Comments
                        mi.comments = unicode(old_soup)

                        # Capture content
                        annotation_list = parent.opts.db.capture_content(uas, cid, transient_db)

                        # Regurgitate content with current CSS style
                        new_soup = parent.opts.db.rerender_to_html_from_list(annotation_list)

                        # Add user_annotations to Comments
                        new_comments = ''
                        if mi.comments is None:
                            new_comments = unicode(new_soup)
                        else:
                            new_comments = mi.comments + \
                                           unicode(comments_divider) + \
                                           unicode(new_soup)

                        # Update the record with stripped custom field, updated Comments
                        # library_db.set_metadata(cid, mi, set_title=False, set_authors=False,
                        #                         commit=True, force_changes=True, notify=True)
                        id_map_old_destination_field[cid] = unicode(old_soup)
                        id_map_new_destination_field[cid] = unicode(new_soup)

                        pb.increment()
            else:
                # Update custom field
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    annotation_list = parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html_from_list(annotation_list)

                    # # Add stripped old_soup plus new_soup to destination field
                    # um = mi.metadata_for_field(new_destination_field)
                    # um['#value#'] = unicode(old_soup) + unicode(new_soup)
                    # mi.set_user_metadata(new_destination_field, um)
                    #
                    # # Update the record
                    # library_db.set_metadata(cid, mi, set_title=False, set_authors=False,
                    #                         commit=True, force_changes=True, notify=True)
                    id_map_old_destination_field[cid] = unicode(old_soup)
                    id_map_new_destination_field[cid] = unicode(new_soup)

                    pb.increment()

    if len(id_map_old_destination_field) > 0:
        debug_print("move_annotations - Updating metadata - for column: %s number of changes=%d" %
                    (old_destination_field, len(id_map_old_destination_field)))
        library_db.new_api.set_field(old_destination_field.lower(), id_map_old_destination_field)
    if len(id_map_new_destination_field) > 0:
        debug_print("move_annotations - Updating metadata - for column: %s number of changes=%d" %
                    (new_destination_field, len(id_map_new_destination_field)))
        library_db.new_api.set_field(new_destination_field.lower(), id_map_new_destination_field)

    # Hide the progress bar
    pb.hide()

    # Change field value to friendly name
    if old_destination_field.startswith('#'):
        for cf in parent.custom_fields:
            if parent.custom_fields[cf]['field'] == old_destination_field:
                old_destination_field = cf
                break
    if new_destination_field.startswith('#'):
        for cf in parent.custom_fields:
            if parent.custom_fields[cf]['field'] == new_destination_field:
                new_destination_field = cf
                break

    # Report what happened
    if len(annotation_map) == 1:
        book_word = _('book')
    else:
        book_word = _('books')
    if old_destination_field == new_destination_field:
        msg = _("Annotations updated to new appearance settings for {0} {1}.").format(
            len(annotation_map), book_word)
    else:
        msg = _("Annotations for {0} {1} moved from <b>{2}</b> to <b>{3}</b>.").format(
            len(annotation_map), book_word, old_destination_field, new_destination_field)
    msg = "<p>{0}</p>".format(msg)
    MessageBox(MessageBox.INFO, '', msg=msg, show_copy_button=False,
               parent=parent.gui).exec_()
    _log_location()
    _log("INFO: %s" % msg)
def generate_html(comments):
    args = dict(
        xmlns=XHTML_NS,
        title_str=title_str,
        css=css,
        title=title,
        author=author,
        publisher=publisher,
        pubdate_label=_("Published"), pubdate=pubdate,
        series_label=_("Series"), series=series,
        rating_label=_("Rating"), rating=rating,
        tags_label=_("Tags"), tags=tags,
        comments=comments,
        footer="",
    )
    for key in mi.custom_field_keys():
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            key = key.replace("#", "_")
            args[key] = escape(val)
            args[key + "_label"] = escape(display_name)
        except:
            # if the val (custom column contents) is None, don't add to args
            pass

    if False:
        print("Custom column values available in jacket template:")
        for key in args.keys():
            if key.startswith("_") and not key.endswith("_label"):
                print(" %s: %s" % ("#" + key[1:], args[key]))

    # Used in the comment describing use of custom columns in templates
    # Don't change this unless you also change it in template.xhtml
    args["_genre_label"] = args.get("_genre_label", "{_genre_label}")
    args["_genre"] = args.get("_genre", "{_genre}")

    formatter = SafeFormatter()
    generated_html = formatter.format(template, **args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    if not series:
        series_tag = soup.find(attrs={"class": "cbj_series"})
        if series_tag is not None:
            series_tag.extract()
    if not rating:
        rating_tag = soup.find(attrs={"class": "cbj_rating"})
        if rating_tag is not None:
            rating_tag.extract()
    if not tags:
        tags_tag = soup.find(attrs={"class": "cbj_tags"})
        if tags_tag is not None:
            tags_tag.extract()
    if not pubdate:
        pubdate_tag = soup.find(attrs={"class": "cbj_pubdata"})
        if pubdate_tag is not None:
            pubdate_tag.extract()
    if output_profile.short_name != "kindle":
        hr_tag = soup.find("hr", attrs={"class": "cbj_kindle_banner_hr"})
        if hr_tag is not None:
            hr_tag.extract()

    return strip_encoding_declarations(soup.renderContents("utf-8").decode("utf-8"))
def move_annotations(parent, annotation_map, old_destination_field, new_destination_field,
                     window_title="Moving annotations"):
    '''
    Move annotations from old_destination_field to new_destination_field
    annotation_map precalculated in thread in config.py
    '''
    import calibre_plugins.annotations.config as cfg

    _log_location("%s -> %s" % (old_destination_field, new_destination_field))

    db = parent.opts.gui.current_db
    id = db.FIELD_MAP['id']

    # Show progress
    pb = ProgressBar(parent=parent, window_title=window_title, on_top=True)
    total_books = len(annotation_map)
    pb.set_maximum(total_books)
    pb.set_value(1)
    pb.set_label('{:^100}'.format('Moving annotations for %d books' % total_books))
    pb.show()

    transient_db = 'transient'

    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        cfg.plugin_prefs.get('COMMENTS_DIVIDER', '· · • · ✦ · • · ·'))

    for cid in annotation_map:
        mi = db.get_metadata(cid, index_is_id=True)

        # Comments -> custom
        if old_destination_field == 'Comments' and new_destination_field.startswith('#'):
            if mi.comments:
                old_soup = BeautifulSoup(mi.comments)
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from Comments
                    uas.extract()

                    # Remove comments_divider from Comments
                    cd = old_soup.find('div', 'comments_divider')
                    if cd:
                        cd.extract()

                    # Save stripped Comments
                    mi.comments = unicode(old_soup)

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Add user_annotations to destination
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record with stripped Comments, populated custom field
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # custom -> Comments
        elif old_destination_field.startswith('#') and new_destination_field == 'Comments':
            if mi.get_user_metadata(old_destination_field, False)['#value#'] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um['#value#'] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add user_annotations to Comments
                    if mi.comments is None:
                        mi.comments = unicode(new_soup)
                    else:
                        mi.comments = mi.comments + \
                                      unicode(comments_divider) + \
                                      unicode(new_soup)

                    # Update the record with stripped custom field, updated Comments
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # custom -> custom
        elif old_destination_field.startswith('#') and new_destination_field.startswith('#'):
            if mi.get_user_metadata(old_destination_field, False)['#value#'] is not None:
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Save stripped custom field data
                    um = mi.metadata_for_field(old_destination_field)
                    um['#value#'] = unicode(old_soup)
                    mi.set_user_metadata(old_destination_field, um)

                    # Add new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

        # same field -> same field - called from config:configure_appearance()
        elif (old_destination_field == new_destination_field):
            pb.set_label('{:^100}'.format('Updating annotations for %d books' % total_books))

            if new_destination_field == 'Comments':
                if mi.comments:
                    old_soup = BeautifulSoup(mi.comments)
                    uas = old_soup.find('div', 'user_annotations')
                    if uas:
                        # Remove user_annotations from Comments
                        uas.extract()

                        # Remove comments_divider from Comments
                        cd = old_soup.find('div', 'comments_divider')
                        if cd:
                            cd.extract()

                        # Save stripped Comments
                        mi.comments = unicode(old_soup)

                        # Capture content
                        parent.opts.db.capture_content(uas, cid, transient_db)

                        # Regurgitate content with current CSS style
                        new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                        # Add user_annotations to Comments
                        if mi.comments is None:
                            mi.comments = unicode(new_soup)
                        else:
                            mi.comments = mi.comments + \
                                          unicode(comments_divider) + \
                                          unicode(new_soup)

                        # Update the record with stripped custom field, updated Comments
                        db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                        commit=True, force_changes=True, notify=True)
                        pb.increment()
            else:
                # Update custom field
                old_soup = BeautifulSoup(mi.get_user_metadata(old_destination_field, False)['#value#'])
                uas = old_soup.find('div', 'user_annotations')
                if uas:
                    # Remove user_annotations from originating custom field
                    uas.extract()

                    # Capture content
                    parent.opts.db.capture_content(uas, cid, transient_db)

                    # Regurgitate content with current CSS style
                    new_soup = parent.opts.db.rerender_to_html(transient_db, cid)

                    # Add stripped old_soup plus new_soup to destination field
                    um = mi.metadata_for_field(new_destination_field)
                    um['#value#'] = unicode(old_soup) + unicode(new_soup)
                    mi.set_user_metadata(new_destination_field, um)

                    # Update the record
                    db.set_metadata(cid, mi, set_title=False, set_authors=False,
                                    commit=True, force_changes=True, notify=True)
                    pb.increment()

    # Hide the progress bar
    pb.hide()

    # Change field value to friendly name
    if old_destination_field.startswith('#'):
        for cf in parent.custom_fields:
            if parent.custom_fields[cf]['field'] == old_destination_field:
                old_destination_field = cf
                break
    if new_destination_field.startswith('#'):
        for cf in parent.custom_fields:
            if parent.custom_fields[cf]['field'] == new_destination_field:
                new_destination_field = cf
                break

    # Report what happened
    if old_destination_field == new_destination_field:
        msg = "<p>Annotations updated to new appearance settings for %d {0}.</p>" % len(annotation_map)
    else:
        msg = ("<p>Annotations for %d {0} moved from <b>%s</b> to <b>%s</b>.</p>" %
               (len(annotation_map), old_destination_field, new_destination_field))
    if len(annotation_map) == 1:
        msg = msg.format('book')
    else:
        msg = msg.format('books')
    MessageBox(MessageBox.INFO, '', msg=msg, show_copy_button=False,
               parent=parent.gui).exec_()
    _log_location()
    _log("INFO: %s" % msg)

    # Update the UI
    updateCalibreGUIView()
def generate_html(comments):
    args = dict(xmlns=XHTML_NS,
                title_str=title_str,
                css=css,
                title=title,
                author=author,
                publisher=publisher,
                pubdate_label=_('Published'), pubdate=pubdate,
                series_label=_('Series'), series=series,
                rating_label=_('Rating'), rating=rating,
                tags_label=_('Tags'), tags=tags,
                comments=comments,
                footer='',
                searchable_tags=' '.join(escape(t)+'ttt' for t in tags.tags_list),
                )
    for key in mi.custom_field_keys():
        m = mi.get_user_metadata(key, False) or {}
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            dkey = key.replace('#', '_')
            dt = m.get('datatype')
            if dt == 'series':
                args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
            elif dt == 'rating':
                args[dkey] = rating_to_stars(mi.get(key), m.get('display', {}).get('allow_half_stars', False))
            elif dt == 'comments':
                val = val or ''
                display = m.get('display', {})
                ctype = display.get('interpret_as') or 'html'
                if ctype == 'long-text':
                    val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(val)
                elif ctype == 'short-text':
                    val = '<span>%s</span>' % escape(val)
                elif ctype == 'markdown':
                    val = markdown(val)
                else:
                    val = comments_to_html(val)
                args[dkey] = val
            else:
                args[dkey] = escape(val)
            args[dkey+'_label'] = escape(display_name)
        except Exception:
            # if the val (custom column contents) is None, don't add to args
            pass

    if False:
        print("Custom column values available in jacket template:")
        for key in args.keys():
            if key.startswith('_') and not key.endswith('_label'):
                print(" %s: %s" % ('#' + key[1:], args[key]))

    # Used in the comment describing use of custom columns in templates
    # Don't change this unless you also change it in template.xhtml
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    formatter = SafeFormatter()
    generated_html = formatter.format(template, **args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    if not series:
        series_tag = soup.find(attrs={'class':'cbj_series'})
        if series_tag is not None:
            series_tag.extract()
    if not rating:
        rating_tag = soup.find(attrs={'class':'cbj_rating'})
        if rating_tag is not None:
            rating_tag.extract()
    if not tags:
        tags_tag = soup.find(attrs={'class':'cbj_tags'})
        if tags_tag is not None:
            tags_tag.extract()
    if not pubdate:
        pubdate_tag = soup.find(attrs={'class':'cbj_pubdata'})
        if pubdate_tag is not None:
            pubdate_tag.extract()
    if output_profile.short_name != 'kindle':
        hr_tag = soup.find('hr', attrs={'class':'cbj_kindle_banner_hr'})
        if hr_tag is not None:
            hr_tag.extract()

    return strip_encoding_declarations(
        soup.renderContents('utf-8').decode('utf-8'))
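# The tail of generate_html() applies the same rule five times: if a metadata
# value is empty, locate its block in the rendered jacket by CSS class and
# extract() it. A hedged generalization of that loop (the helper name is
# invented; the class names are the ones used above):
def strip_empty_jacket_sections(soup, sections):
    # sections: mapping of CSS class -> the metadata value that populates it
    for css_class, value in sections.items():
        if not value:
            tag = soup.find(attrs={'class': css_class})
            if tag is not None:
                tag.extract()
    return soup

# Usage sketch, mirroring the checks in generate_html():
#   strip_empty_jacket_sections(soup, {'cbj_series': series, 'cbj_rating': rating,
#                                      'cbj_tags': tags, 'cbj_pubdata': pubdate})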
def preprocess_html(self, soup):
    # logical 'and', not the bitwise '&' this test previously used
    if self.webEdition and (self.oldest_article > 0):
        date_tag = soup.find(True, attrs={'class': ['dateline', 'date']})
        if date_tag:
            date_str = self.tag_to_string(date_tag, use_alt=False)
            date_str = date_str.replace('Published:', '')
            date_items = date_str.split(',')
            try:
                datestring = date_items[0] + ' ' + date_items[1]
                article_date = self.decode_us_date(datestring)
            except:
                article_date = date.today()
            if article_date < self.earliest_date:
                self.log("Skipping article dated %s" % date_str)
                return None

    # all articles are from today, no need to print the date on every page
    try:
        if not self.webEdition:
            date_tag = soup.find(True, attrs={'class': ['dateline', 'date']})
            if date_tag:
                date_tag.extract()
    except:
        self.log("Error removing the published date")

    if self.useHighResImages:
        try:
            # open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
            enlargeThisList = soup.findAll('div', {'class': 'icon enlargeThis'})
            if enlargeThisList:
                for popupref in enlargeThisList:
                    popupreflink = popupref.find('a')
                    if popupreflink:
                        reflinkstring = str(popupreflink['href'])
                        refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('")
                        refend = reflinkstring.find(".html", refstart) + len(".html")
                        reflinkstring = reflinkstring[refstart:refend]

                        popuppage = self.browser.open(reflinkstring)
                        popuphtml = popuppage.read()
                        popuppage.close()
                        if popuphtml:
                            st = time.localtime()
                            year = str(st.tm_year)
                            month = "%.2d" % st.tm_mon
                            day = "%.2d" % st.tm_mday
                            imgbase = 'http://graphics8.nytimes.com/images/' + year + '/' + month + '/' + day + '/'
                            imgstartpos = popuphtml.find(imgbase) + len(imgbase)
                            highResImageLink = imgbase + popuphtml[imgstartpos:popuphtml.find('.jpg', imgstartpos) + 4]
                            popupSoup = BeautifulSoup(popuphtml)
                            highResTag = popupSoup.find('img', {'src': highResImageLink})
                            if highResTag:
                                # initialize so a failed lookup below cannot raise NameError later
                                imageTag = None
                                try:
                                    newWidth = highResTag['width']
                                    newHeight = highResTag['height']
                                    imageTag = popupref.parent.find("img")
                                except:
                                    self.log("Error: finding width and height of img")
                                popupref.extract()
                                if imageTag:
                                    try:
                                        imageTag['src'] = highResImageLink
                                        imageTag['width'] = newWidth
                                        imageTag['height'] = newHeight
                                    except:
                                        self.log("Error setting the src width and height parameters")
        except Exception:
            self.log("Error pulling high resolution images")

    try:
        # remove "Related content" bar
        runAroundsFound = soup.findAll('div', {'class': ['articleInline runaroundLeft',
                                                         'articleInline doubleRule runaroundLeft',
                                                         'articleInline runaroundLeft firstArticleInline',
                                                         'articleInline runaroundLeft ',
                                                         'articleInline runaroundLeft lastArticleInline']})
        if runAroundsFound:
            for runAround in runAroundsFound:
                # find all section headers
                hlines = runAround.findAll(True, {'class': ['sectionHeader', 'sectionHeader flushBottom']})
                if hlines:
                    for hline in hlines:
                        hline.extract()
                # find all h6 section headers
                hlines = runAround.findAll('h6')
                if hlines:
                    for hline in hlines:
                        hline.extract()
    except:
        self.log("Error removing related content bar")

    try:
        # in case pulling images failed, delete the enlarge this text
        enlargeThisList = soup.findAll('div', {'class': 'icon enlargeThis'})
        if enlargeThisList:
            for popupref in enlargeThisList:
                popupref.extract()
    except:
        self.log("Error removing Enlarge this text")

    return self.strip_anchors(soup)
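# The date gate at the top of preprocess_html() in isolation: parse a
# 'Published: Month day, year' dateline and decide whether the article is
# older than the recipe's cutoff. A self-contained sketch that substitutes
# stdlib strptime for the recipe's decode_us_date() helper - an assumption
# about equivalent behaviour, not the recipe's actual implementation.
import time
from datetime import date

def article_is_too_old(date_str, earliest_date):
    date_str = date_str.replace('Published:', '')
    date_items = date_str.split(',')
    try:
        # e.g. 'January 5' + ' 2011' -> 'January 5 2011'
        st = time.strptime((date_items[0] + ' ' + date_items[1]).strip(), '%B %d %Y')
        article_date = date(st.tm_year, st.tm_mon, st.tm_mday)
    except Exception:
        # unparseable datelines are treated as current, matching the recipe
        article_date = date.today()
    return article_date < earliest_date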
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    old_soup, new_soup: BeautifulSoup()
    Need to strip <hr>, re-sort based on location, build new merged_soup
    with optional interleaved <hr> elements.
    '''
    TRANSIENT_DB = 'transient'

    debug_print("merge_annotations - cid=", cid)
    debug_print("merge_annotations - old_soup=", old_soup)
    debug_print("merge_annotations - new_soup=", new_soup)

    # Fetch preferred merge index technique
    merge_index = getattr(parent.reader_app_class, 'MERGE_INDEX', 'hash')

    if merge_index == 'hash':
        # Get the hashes of any existing annotations
        oiuas = old_soup.findAll('div', 'annotation')
        old_hashes = set([ua['hash'] for ua in oiuas])
        debug_print("old hashes=", old_hashes)

        # Extract old user_annotations
        regurgitated_soup = None  # guard: only set when old annotations exist
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            debug_print("Getting old annotations - count=", len(ouas))
            debug_print("Getting old annotations - old_soup=", old_soup)
            debug_print("Getting old annotations - ouas=", ouas)
            ouas.extract()
            debug_print("Getting old annotations - ouas after extract=", ouas)
            debug_print("Getting old annotations - old_soup after extract=", old_soup)

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))
            debug_print("Getting old annotations - regurgitated_soup=", regurgitated_soup)

        # Find new annotations
        uas = new_soup.findAll('div', 'annotation')
        new_hashes = set([ua['hash'] for ua in uas])
        debug_print("new_hashes=", sorted(new_hashes))
        debug_print("old hashes=", sorted(old_hashes))
        debug_print("new_hashes.difference(old_hashes)=", new_hashes.difference(old_hashes))

        updates = list(new_hashes.difference(old_hashes))
        debug_print("differences between old and new hashes - updates=", updates)
        if len(updates) and ouas is not None:
            debug_print("have updates and ouas")
            # Append new annotations to the regurgitated (restyled) set
            dtc = len(regurgitated_soup.div)
            debug_print("length regurgitated_soup - dtc=", dtc)
            for new_annotation_id in updates:
                debug_print("extending regurgitated_soup - new_annotation_id=", new_annotation_id)
                new_annotation = new_soup.find('div', {'hash': new_annotation_id})
                regurgitated_soup.div.insert(dtc, new_annotation)
                dtc += 1
            merged_soup = unicode(sort_merged_annotations(regurgitated_soup))
        else:
            debug_print("no updates, or no existing user_annotations")
            if regurgitated_soup:
                debug_print("keeping restyled existing annotations")
                debug_print("unicode(regurgitated_soup)=", unicode(regurgitated_soup))
                debug_print("unicode(new_soup)=", unicode(new_soup))
                merged_soup = unicode(regurgitated_soup)
            else:
                debug_print("just new_soup")
                merged_soup = unicode(new_soup)
        debug_print("merged_soup=", merged_soup)
        return merged_soup

    elif merge_index == 'timestamp':
        timestamps = {}

        # Get the timestamps and hashes of the stored annotations
        suas = old_soup.findAll('div', 'annotation')
        for sua in suas:
            try:
                timestamp = sua.find('td', 'timestamp')['uts']
                timestamps[timestamp] = {'stored_hash': sua['hash']}
            except:
                continue

        # Rerender stored annotations
        regurgitated_soup = None  # guard: only set when stored annotations exist
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()
            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)
            # Regurgitate old_soup with current CSS
            regurgitated_soup = BeautifulSoup(parent.opts.db.rerender_to_html(TRANSIENT_DB, cid))

        # Add device annotation timestamps and hashes
        duas = new_soup.findAll('div', 'annotation')
        for dua in duas:
            try:
                timestamp = dua.find('td', 'timestamp')['uts']
                if timestamp in timestamps:
                    timestamps[timestamp]['device_hash'] = dua['hash']
                else:
                    timestamps[timestamp] = {'device_hash': dua['hash']}
            except:
                print("ERROR: malformed timestamp in device annotation")
                print(dua.prettify())

        merged_soup = BeautifulSoup(ANNOTATIONS_HEADER)
        for ts in sorted(timestamps):
            if 'stored_hash' in timestamps[ts] and 'device_hash' not in timestamps[ts]:
                # Stored only - add from regurgitated_soup
                annotation = regurgitated_soup.find('div', {'hash': timestamps[ts]['stored_hash']})
            elif 'stored_hash' not in timestamps[ts] and 'device_hash' in timestamps[ts]:
                # Device only - add from new_soup
                annotation = new_soup.find('div', {'hash': timestamps[ts]['device_hash']})
            elif timestamps[ts]['stored_hash'] == timestamps[ts]['device_hash']:
                # Stored matches device - add from regurgitated_soup, as user may have modified
                annotation = regurgitated_soup.find('div', {'hash': timestamps[ts]['stored_hash']})
            else:
                # Device has been updated since initial capture - add from new_soup
                annotation = new_soup.find('div', {'hash': timestamps[ts]['device_hash']})
            merged_soup.div.append(annotation)

        return unicode(sort_merged_annotations(merged_soup))
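# The timestamp branch of merge_annotations() reduces to one decision per
# annotation: keep the stored (restyled) copy unless the device copy's hash
# differs, in which case the device copy wins. A distilled sketch of that
# rule; pick_annotation_source() is an invented name for illustration.
def pick_annotation_source(entry):
    # entry: {'stored_hash': ..., 'device_hash': ...}, either key optional
    stored = entry.get('stored_hash')
    device = entry.get('device_hash')
    if stored and not device:
        return 'stored'    # library only: keep the restyled copy
    if device and not stored:
        return 'device'    # new on the device: take it as-is
    if stored == device:
        return 'stored'    # unchanged: stored copy carries the current CSS
    return 'device'        # hashes differ: device copy was edited since capture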