def postprocess_html(self, soup, first_fetch):
    '''
    Strip a fetched page down to its main content block.

    Pages containing <div id="transcript"> are treated as interviews: the
    "presentation_full" block is kept, its <h1> is cleaned, and the
    <div class="qa"> blocks are re-attached below it. All other pages keep
    <div id="content"> (which holds the title) with the "text_info" block
    appended to it.

    soup: parsed page, modified in place.
    first_fetch: not used by this method.

    Returns the same soup, whose <body> contains only the content block.
    '''
    # Drop the <em> inside the author span, when the page has one.
    author_general = soup.find('span', {'class': 'author_general'})
    if author_general is not None and author_general.em is not None:
        author_general.em.extract()

    transcript_div = soup.find('div', {'id': 'transcript'})
    if transcript_div:
        # Interview page: pull every <div class="qa"> out of the transcript.
        qa_div_list = list(find_by_class(transcript_div, 'div', 'qa'))
        for qa_div in qa_div_list:
            qa_div.extract()
            # Replace <a class="question_link">...</a> with <strong>...</strong>
            question_link = qa_div.find('a', {'class': 'question_link'})
            if question_link is None:
                continue
            question_strong = Tag(soup, 'strong')
            question_strong.append(question_link.string)
            question_link.replaceWith(question_strong)

        full_div = find_by_class(soup.find('div', {'id': 'content'}),
                                 'div', 'presentation_full').next()

        # Clean the <h1>: drop its <span>, flatten its <div> to plain text.
        full_div.h1.span.extract()
        title_div = full_div.h1.div
        title_div.replaceWith(title_div.string)

        # Clear the presentation area, then add the qa list back.
        for div in full_div.findAll('div'):
            div.extract()
        for qa_div in qa_div_list:
            full_div.append(qa_div)
    else:
        # Text-only page; the text block itself carries no title.
        text_div = find_by_class(soup, 'div', 'text_info').next()
        text_div.extract()
        for other in text_div.findAll('div'):
            other.extract()

        # full_div contains the title.
        full_div = soup.find('div', {'id': 'content'})
        for other in full_div.findAll('div'):
            other.extract()
        full_div.append(text_div)

    # Keep full_div in <body> only. Iterate over a snapshot of the children:
    # extract() removes the node from the list being walked, so looping over
    # the live list would skip siblings and leave them in the page.
    full_div.extract()
    for other in list(soup.body.contents):
        other.extract()
    soup.body.append(full_div)

    return soup
def construct(self, book_notes):
    '''
    Given a list of notes, render HTML.

    Every note is appended to a <p class="book_note"> whose style attribute
    comes from self._get_note_style(); each paragraph is wrapped in a
    <div class="book_note"> and collected inside one outer
    <div class="book_notes">.

    Returns the resulting soup, or None when book_notes is empty or None.
    '''
    if not book_notes:
        return None

    rendered = BeautifulSoup('''<div class="{0}"></div>'''.format('book_notes'))
    for entry in book_notes:
        paragraph = Tag(rendered, 'p',
                        [('class', "book_note"),
                         ('style', "{0}".format(self._get_note_style()))])
        paragraph.append(entry)

        wrapper = Tag(rendered, 'div', [('class', "book_note")])
        wrapper.append(paragraph)
        rendered.div.append(wrapper)

    return rendered
def construct(self, book_notes):
    '''
    Given a list of notes, render HTML.

    Builds an outer <div class="book_notes"> and adds, for each note, a
    <div class="book_note"> wrapping a <p class="book_note"> that holds the
    note and is styled with the value of self._get_note_style().

    Returns the soup, or None if there are no notes to render.
    '''
    notes_soup = None
    if book_notes:
        outer_markup = '''<div class="{0}"></div>'''.format('book_notes')
        notes_soup = BeautifulSoup(outer_markup)
        for single_note in book_notes:
            note_div = Tag(notes_soup, 'div', [('class', "book_note")])
            p_attrs = [('class', "book_note"),
                       ('style', "{0}".format(self._get_note_style()))]
            note_p = Tag(notes_soup, 'p', p_attrs)
            note_p.append(single_note)
            note_div.append(note_p)
            notes_soup.div.append(note_div)
    return notes_soup
def to_HTML(self, header=''):
    '''
    Generate HTML with user-specified CSS, element order.

    The 'appearance_css' preference supplies, per element ('Text', 'Note',
    'Timestamp'), its CSS and its position in the output. Each entry of
    self.annotations becomes one <div class="annotation"> inserted into the
    first <div> of ANNOTATIONS_HEADER, optionally separated by the
    'HORIZONTAL_RULE' preference markup.

    header: not used by this method.

    Returns the rendered document as a unicode string; with no annotations
    this is just the rendered ANNOTATIONS_HEADER.
    '''
    # Retrieve CSS prefs
    from calibre_plugins.annotations.appearance import default_elements
    stored_css = plugin_prefs.get('appearance_css', default_elements)

    # Every style defaults to '' so that a stored preference list lacking
    # one of the elements cannot leave the name unbound further down.
    styles = {'Note': '', 'Text': '', 'Timestamp': ''}
    elements = []
    for element in stored_css:
        name = element['name']
        elements.append(name)
        if name in styles:
            styles[name] = re.sub('\n', '', element['css'])
    note_style = styles['Note']
    text_style = styles['Text']
    ts_style = styles['Timestamp']

    # Additional CSS for timestamp color and bg to be formatted
    datetime_style = ("background-color:{0};color:{1};" + ts_style)

    # Timestamp row markup; whitespace between tags is stripped once here
    # rather than on every pass through the ordering loop.
    ts_css = '''<table cellpadding="0" width="100%" style="{ts_style}" color="{color}">
            <tr>
                <td class="location" style="text-align:left">{location}</td>
                <td class="timestamp" uts="{unix_timestamp}" style="text-align:right">{friendly_timestamp}</td>
            </tr>
        </table>'''
    fragments = {
        'Text': '{text}',
        'Note': '{note}',
        'Timestamp': re.sub(r'>\s+<', r'><', ts_css),
    }

    # Order the elements according to stored preferences
    comments_body = ''.join(
        fragments[name] for name in elements if name in fragments)

    soup = BeautifulSoup(ANNOTATIONS_HEADER)
    if not self.annotations:
        return unicode(soup)

    last_index = len(self.annotations) - 1
    dtc = 0

    # Add the annotations
    for i, agroup in enumerate(
            sorted(self.annotations, key=self._annotation_sorter)):
        location = agroup.location
        if location is None:
            location = ''

        friendly_timestamp = self._timestamp_to_datestr(agroup.timestamp)

        text = ''
        if agroup.text:
            text = ''.join(
                '<p class="highlight" style="{0}">{1}</p>'.format(
                    text_style, agt)
                for agt in agroup.text)

        note = ''
        if agroup.note:
            note = ''.join(
                '<p class="note" style="{0}">{1}</p>'.format(
                    note_style, agn)
                for agn in agroup.note)

        try:
            color_entry = COLOR_MAP[agroup.highlightcolor]
            dt_bgcolor = color_entry['bg']
            dt_fgcolor = color_entry['fg']
        except (KeyError, TypeError):
            # Missing or unknown highlight color: log it, use the default.
            if agroup.highlightcolor is None:
                msg = "No highlight color specified, using Default"
            else:
                msg = "Unknown color '%s' specified" % agroup.highlightcolor
            self._log_location(msg)
            dt_bgcolor = COLOR_MAP['Default']['bg']
            dt_fgcolor = COLOR_MAP['Default']['fg']

        if agroup.hash is not None:
            # Use existing hash when re-rendering
            annotation_hash = agroup.hash
        else:
            m = hashlib.md5()
            m.update(text.encode('utf-8'))
            m.update(note.encode('utf-8'))
            annotation_hash = m.hexdigest()

        # Create the wrapper via new_tag() when the installed BeautifulSoup
        # provides it; otherwise fall back to constructing a Tag directly.
        try:
            divTag = BeautifulSoup().new_tag('div')
        except Exception:
            divTag = Tag(BeautifulSoup(), 'div')

        content_args = {
            'color': agroup.highlightcolor,
            'friendly_timestamp': friendly_timestamp,
            'location': location,
            'note': note,
            'text': text,
            'ts_style': datetime_style.format(dt_bgcolor, dt_fgcolor),
            'unix_timestamp': agroup.timestamp,
        }
        comments_body_soup = BeautifulSoup(
            comments_body.format(**content_args))

        # Move the parsed nodes into the wrapper one at a time; the loop
        # relies on append() detaching each node from the parsed body.
        while len(comments_body_soup.body.contents) > 0:
            divTag.append(comments_body_soup.body.contents[0])

        divTag['class'] = "annotation"
        divTag['genre'] = ''
        if agroup.genre:
            divTag['genre'] = escape(agroup.genre)
        divTag['hash'] = annotation_hash
        divTag['location_sort'] = agroup.location_sort
        divTag['reader'] = agroup.reader_app
        divTag['style'] = ANNOTATION_DIV_STYLE
        soup.div.insert(dtc, divTag)
        dtc += 1

        # Optional separator between annotations, never after the last one.
        if i < last_index and \
                plugin_prefs.get('appearance_hr_checkbox', False):
            soup.div.insert(
                dtc,
                BeautifulSoup(
                    plugin_prefs.get('HORIZONTAL_RULE',
                                     '<hr width="80%" />')))
            dtc += 1

    return unicode(soup)
def postprocess_html(self, soup, first_fetch):
    '''
    Strip a fetched page down to its navigation block and main content.

    Pages containing <div id="transcript"> are treated as interviews: the
    "presentation_full" block is kept, its <h1> is cleaned, and the
    <div class="qa"> blocks are re-attached below it. All other pages keep
    <div id="content"> (which holds the title) with the "text_info" block
    appended to it. The first <div> left in <body> after the content block
    is detached is kept as the navigation block.

    soup: parsed page, modified in place.
    first_fetch: not used by this method.

    Returns the same soup, whose <body> contains only the navigation block
    (when one exists) followed by the content block.
    '''
    # Drop the <em> inside the author span, when the page has one.
    author_general = soup.find('span', {'class': 'author_general'})
    if author_general is not None and author_general.em is not None:
        author_general.em.extract()

    transcript_div = soup.find('div', {'id': 'transcript'})
    if transcript_div:
        # Interview page: pull every <div class="qa"> out of the transcript.
        qa_div_list = list(find_by_class(transcript_div, 'div', 'qa'))
        for qa_div in qa_div_list:
            qa_div.extract()
            # Replace <a class="question_link">...</a> with <strong>...</strong>
            question_link = qa_div.find('a', {'class': 'question_link'})
            if question_link is None:
                continue
            question_strong = Tag(soup, 'strong')
            question_strong.append(question_link.string)
            question_link.replaceWith(question_strong)

        full_div = find_by_class(soup.find('div', {'id': 'content'}),
                                 'div', 'presentation_full').next()

        # Clean the <h1>: drop its <span>, flatten its <div> to plain text.
        full_div.h1.span.extract()
        title_div = full_div.h1.div
        title_div.replaceWith(title_div.string)

        # Clear the presentation area, then add the qa list back.
        for div in full_div.findAll('div'):
            div.extract()
        for qa_div in qa_div_list:
            full_div.append(qa_div)
    else:
        # Text-only page; the text block itself carries no title.
        text_div = find_by_class(soup, 'div', 'text_info').next()
        text_div.extract()
        for other in text_div.findAll('div'):
            other.extract()

        # full_div contains the title.
        full_div = soup.find('div', {'id': 'content'})
        for other in full_div.findAll('div'):
            other.extract()
        full_div.append(text_div)

    full_div.extract()

    nav_div = soup.body.div
    if nav_div is not None:
        nav_div.extract()

    # Keep nav_div and full_div in <body> only. Iterate over a snapshot of
    # the children: extract() removes the node from the list being walked,
    # so looping over the live list would skip siblings.
    for other in list(soup.body.contents):
        other.extract()

    if nav_div is not None:
        soup.body.append(nav_div)
    soup.body.append(full_div)

    return soup
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    old_soup, new_soup: BeautifulSoup()
    Need to strip <hr>, re-sort based on location, build new merged_soup
    with optional interleaved <hr> elements.

    Annotations are matched by the 'uts' attribute of their
    <td class="timestamp"> cell. For each timestamp:
      - stored only:            use the re-rendered stored copy
      - device only:            use the device copy from new_soup
      - both, hashes equal:     use the re-rendered stored copy
      - both, hashes different: use the device copy

    parent: provides parent.opts.db, used to capture and re-render the
            stored annotations.
    cid: passed through to the db capture/re-render calls.

    Returns new_soup, with its 'user_annotations' div replaced by the merged
    set, as a unicode string. old_soup loses its 'user_annotations' div as a
    side effect.
    '''
    TRANSIENT_DB = 'transient'

    timestamps = {}

    # Get the timestamps and hashes of the stored annotations
    suas = []
    ouas = None
    if old_soup is not None:
        suas = old_soup.findAll('div', 'annotation')
        ouas = old_soup.find('div', 'user_annotations')

    for sua in suas:
        timestamp = sua.find('td', 'timestamp')['uts']
        timestamps[timestamp] = {'stored_hash': sua['hash']}

    # Rerender stored annotations. regurgitated_soup stays None when there
    # is no stored user_annotations div to re-render.
    regurgitated_soup = None
    if ouas:
        ouas.extract()

        # Capture existing annotations
        parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

        # Regurgitate annotations with current CSS
        rerendered_annotations = parent.opts.db.rerender_to_html(
            TRANSIENT_DB, cid)
        regurgitated_soup = BeautifulSoup(rerendered_annotations)

    # Add device annotation timestamps and hashes
    for dua in new_soup.findAll('div', 'annotation'):
        timestamp = dua.find('td', 'timestamp')['uts']
        timestamps.setdefault(timestamp, {})['device_hash'] = dua['hash']

    merged_annotations = Tag(BeautifulSoup(), 'div',
                             [('class', "user_annotations"),
                              ('style', 'margin:0')])

    for ts in sorted(timestamps):
        stored_hash = timestamps[ts].get('stored_hash')
        device_hash = timestamps[ts].get('device_hash')

        # The stored copy wins when it is the only copy, or when it matches
        # the device copy (the user may have modified the stored one).
        prefer_stored = (stored_hash is not None and
                         device_hash in (None, stored_hash))

        annotation = None
        if prefer_stored and regurgitated_soup is not None:
            annotation = regurgitated_soup.find(
                'div', {'hash': stored_hash})
        elif device_hash is not None:
            # Device only, device updated since initial capture, or no
            # re-rendered copy available for a matching annotation.
            annotation = new_soup.find('div', {'hash': device_hash})

        if annotation is None:
            # No copy could be located; appending None would raise.
            continue
        merged_annotations.append(annotation)

    merged_annotations = sort_merged_annotations(merged_annotations)

    # Update new_soup with merged_annotations
    new_soup_uas = new_soup.find('div', 'user_annotations')
    new_soup_uas.replaceWith(merged_annotations)

    return unicode(new_soup)
def merge_annotations(parent, cid, old_soup, new_soup):
    '''
    old_soup, new_soup: BeautifulSoup()
    Need to strip <hr>, re-sort based on location, build new merged_soup
    with optional interleaved <hr> elements.

    Stored (old_soup) and device (new_soup) annotations are paired up by the
    'uts' attribute of their <td class="timestamp"> cell, then one copy per
    timestamp is chosen:
      - only stored            -> re-rendered stored copy
      - only on device         -> device copy
      - both, same hash        -> re-rendered stored copy
      - both, different hashes -> device copy

    parent: supplies parent.opts.db for capturing and re-rendering the
            stored annotations.
    cid: handed to the db capture/re-render calls unchanged.

    Returns unicode(new_soup) after its 'user_annotations' div has been
    replaced with the merged, sorted annotations. The 'user_annotations'
    div is extracted from old_soup along the way.
    '''
    TRANSIENT_DB = 'transient'

    def _find_by_hash(source, annotation_hash):
        # Look up an annotation div by hash; None if source is unavailable.
        if source is None:
            return None
        return source.find('div', {'hash': annotation_hash})

    timestamps = {}
    regurgitated_soup = None

    if old_soup is not None:
        # Get the timestamps and hashes of the stored annotations
        for sua in old_soup.findAll('div', 'annotation'):
            timestamp = sua.find('td', 'timestamp')['uts']
            timestamps[timestamp] = {'stored_hash': sua['hash']}

        # Rerender stored annotations
        ouas = old_soup.find('div', 'user_annotations')
        if ouas:
            ouas.extract()

            # Capture existing annotations
            parent.opts.db.capture_content(ouas, cid, TRANSIENT_DB)

            # Regurgitate annotations with current CSS
            rerendered_annotations = parent.opts.db.rerender_to_html(TRANSIENT_DB, cid)
            regurgitated_soup = BeautifulSoup(rerendered_annotations)

    # Add device annotation timestamps and hashes
    for dua in new_soup.findAll('div', 'annotation'):
        timestamp = dua.find('td', 'timestamp')['uts']
        if timestamp in timestamps:
            timestamps[timestamp]['device_hash'] = dua['hash']
        else:
            timestamps[timestamp] = {'device_hash': dua['hash']}

    merged_annotations = Tag(BeautifulSoup(), 'div',
                             [('class', "user_annotations"), ('style', 'margin:0')])

    for ts in sorted(timestamps):
        hashes = timestamps[ts]
        has_stored = 'stored_hash' in hashes
        has_device = 'device_hash' in hashes

        annotation = None
        if has_stored and (not has_device or
                           hashes['stored_hash'] == hashes['device_hash']):
            # Stored only, or stored matches device - add from
            # regurgitated_soup, as user may have modified
            annotation = _find_by_hash(regurgitated_soup, hashes['stored_hash'])
            if annotation is None and has_device:
                # No re-rendered copy to use; the device copy has the same hash
                annotation = _find_by_hash(new_soup, hashes['device_hash'])
        elif has_device:
            # Device only, or device has been updated since initial capture
            annotation = _find_by_hash(new_soup, hashes['device_hash'])

        if annotation is None:
            # Nothing could be located for this timestamp; append(None)
            # would raise, so leave it out of the merge.
            continue
        merged_annotations.append(annotation)

    merged_annotations = sort_merged_annotations(merged_annotations)

    # Update new_soup with merged_annotations
    new_soup_uas = new_soup.find('div', 'user_annotations')
    new_soup_uas.replaceWith(merged_annotations)

    return unicode(new_soup)