def extract_calibre_cover(raw, base, log):
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(raw)
    matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p',
                              'span', 'font', 'br'])
    images = soup.findAll('img')
    if matches is None and len(images) == 1 and \
            images[0].get('alt', '') == 'cover':
        img = images[0]
        img = os.path.join(base, *img['src'].split('/'))
        if os.path.exists(img):
            return open(img, 'rb').read()

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find('body')
        if body is not None:
            text = u''.join(map(unicode, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll('img', src=True)
            if 0 < len(images) < 2:
                img = os.path.join(base, *images[0]['src'].split('/'))
                if os.path.exists(img):
                    return open(img, 'rb').read()

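# Usage sketch (hypothetical, not from the source): exercise the
# single-image heuristic above against a throwaway directory. Assumes a
# calibre (Python 2) environment where extract_calibre_cover is importable.
import os
import tempfile

base = tempfile.mkdtemp()
with open(os.path.join(base, 'cover.jpg'), 'wb') as f:
    f.write('\xff\xd8\xff\xe0')  # stand-in JPEG header bytes
raw = u'<html><body><img src="cover.jpg" alt="cover"/></body></html>'
cover_data = extract_calibre_cover(raw, base, log=None)
assert cover_data is not None  # one <img alt="cover"> and no text tags match
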
def search_for_asin_on_amazon(self, query):
    '''Search for a book's ASIN on Amazon using the given query'''
    query = urlencode({'keywords': query})
    url = '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query
    try:
        response = open_url(self._connections['amazon'], url)
    except PageDoesNotExist:
        return None

    # check to make sure there are results
    if ('did not match any products' in response
            and 'Did you mean:' not in response
            and 'so we searched in All Departments' not in response):
        return None

    soup = BeautifulSoup(response)
    results = soup.findAll('div', {'class': 's-result-list'})
    if not results:
        return None

    for result in results:
        if 'Buy now with 1-Click' in str(result):
            asin_search = AMAZON_ASIN_PAT.search(str(result))
            if asin_search:
                return asin_search.group(1)

    return None

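# AMAZON_ASIN_PAT is defined elsewhere in the plugin; a plausible sketch of
# its shape (an assumption, not the plugin's actual pattern) is a capture of
# the ten-character ASIN out of a /dp/ product link:
import re

AMAZON_ASIN_PAT = re.compile(r'/dp/(B[0-9A-Z]{9})')  # hypothetical pattern
m = AMAZON_ASIN_PAT.search('<a href="/dp/B00ZV9PXP2/ref=xyz">Buy now with 1-Click</a>')
assert m and m.group(1) == 'B00ZV9PXP2'
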
def get_annotations_date_range(self):
    '''
    Find oldest, newest annotation in annotated books
    initial values of self.oldest, self.newest are reversed to
    allow update comparisons
    if no annotations, restore to correct values
    '''
    annotations_found = False

    for cid in self.annotation_map:
        mi = self.cdb.get_metadata(cid, index_is_id=True)
        if self.field == 'Comments':
            soup = BeautifulSoup(mi.comments)
        else:
            soup = BeautifulSoup(mi.get_user_metadata(self.field, False)['#value#'])

        uas = soup.findAll('div', 'annotation')
        for ua in uas:
            annotations_found = True
            timestamp = float(ua.find('td', 'timestamp')['uts'])
            if timestamp < self.oldest_annotation:
                self.oldest_annotation = timestamp
            if timestamp > self.newest_annotation:
                self.newest_annotation = timestamp

    if not annotations_found:
        temp = self.newest_annotation
        self.newest_annotation = self.oldest_annotation
        self.oldest_annotation = temp

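# The reversed-sentinel idiom above, in isolation: oldest starts high and
# newest starts low, so the first real timestamp updates both bounds.
oldest, newest = float('inf'), float('-inf')
for ts in [1325376000.0, 1356998400.0, 1340000000.0]:  # sample epoch seconds
    if ts < oldest:
        oldest = ts
    if ts > newest:
        newest = ts
assert (oldest, newest) == (1325376000.0, 1356998400.0)
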
def get_soup(self, src, url=None):
    nmassage = []
    nmassage.extend(self.preprocess_regexps)
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    for pat, repl in nmassage:
        usrc = pat.sub(repl, usrc)
    soup = BeautifulSoup(usrc)

    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
        for pat, repl in nmassage:
            replace = pat.sub(repl, replace)
        soup = BeautifulSoup(replace)

    if self.keep_only_tags:
        body = soup.new_tag('body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)

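# The nmassage entries consumed above are (compiled_pattern, replacement)
# pairs. A recipe would typically supply them like this (illustrative
# values, not from the source):
import re

preprocess_regexps = [
    (re.compile(r'<script.*?</script>', re.DOTALL), lambda m: ''),
    (re.compile(r'&nbsp;'), lambda m: ' '),
]
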
# Variant of search_for_asin_on_amazon above that targets a different Amazon
# results container ('resultsCol' instead of 's-result-list').
def search_for_asin_on_amazon(self, query):
    '''Search for a book's ASIN on Amazon using the given query'''
    query = urlencode({'keywords': query})
    url = '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query
    try:
        response = open_url(self._connections['amazon'], url)
    except PageDoesNotExist:
        return None

    # check to make sure there are results
    if ('did not match any products' in response
            and 'Did you mean:' not in response
            and 'so we searched in All Departments' not in response):
        return None

    soup = BeautifulSoup(response)
    results = soup.findAll('div', {'id': 'resultsCol'})
    if not results:
        return None

    for result in results:
        if 'Buy now with 1-Click' in str(result):
            asin_search = AMAZON_ASIN_PAT.search(str(result))
            if asin_search:
                return asin_search.group(1)

    return None

# Older BeautifulSoup 3 variant of get_soup: relies on markupMassage and
# Tag() instead of pre-applying the regexps and soup.new_tag().
def get_soup(self, src, url=None):
    nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
    nmassage.extend(self.preprocess_regexps)
    # Some websites have buggy doctype declarations that mess up beautifulsoup
    nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL | re.IGNORECASE), lambda m: '')]
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    soup = BeautifulSoup(usrc, markupMassage=nmassage)

    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        soup = BeautifulSoup(xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0],
                             markupMassage=nmassage)

    if self.keep_only_tags:
        body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)

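# The **spec dicts consumed by keep_only_tags/remove_tags above are
# BeautifulSoup find() keyword arguments. Typical recipe values look like
# this (illustrative, not from the source):
keep_only_tags = [dict(name='div', attrs={'id': 'article-body'})]
remove_tags = [dict(name='div', attrs={'class': 'advert'})]
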
def get_asin(self, connection):
    query = urlencode({'keywords': '%s - %s' % (self._title, self._author)})
    try:
        connection.request('GET', '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query,
                           headers=self.HEADERS)
        response = connection.getresponse().read()
    except:
        try:
            connection.close()
            if self._proxy:
                connection = HTTPConnection(self._http_address, self._http_port)
                connection.set_tunnel('www.amazon.com', 80)
            else:
                connection = HTTPConnection('www.amazon.com')

            connection.request('GET', '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query,
                               headers=self.HEADERS)
            response = connection.getresponse().read()
        except:
            self._status = self.FAIL
            self._status_message = self.FAILED_COULD_NOT_CONNECT_TO_AMAZON
            raise Exception(self._status_message)

    # check to make sure there are results
    if ('did not match any products' in response
            and 'Did you mean:' not in response
            and 'so we searched in All Departments' not in response):
        self._status = self.FAIL
        self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_PAGE
        raise Exception(self._status_message)

    soup = BeautifulSoup(response)
    results = soup.findAll('div', {'id': 'resultsCol'})
    if not results:
        self._status = self.FAIL
        self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_PAGE
        raise Exception(self._status_message)

    for r in results:
        if 'Buy now with 1-Click' in str(r):
            asinSearch = self.AMAZON_ASIN_PAT.search(str(r))
            if asinSearch:
                self._asin = asinSearch.group(1)
                mi = self._db.get_metadata(self._book_id)
                identifiers = mi.get_identifiers()
                identifiers['mobi-asin'] = self._asin
                mi.set_identifiers(identifiers)
                self._db.set_metadata(self._book_id, mi)
                self._book_settings.prefs['asin'] = self._asin
                return connection

    self._status = self.FAIL
    self._status_message = self.FAILED_COULD_NOT_FIND_AMAZON_ASIN
    raise Exception(self._status_message)

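# The proxy fallback in isolation (Python 2 httplib): connect to the proxy
# host, then tunnel to Amazon. Host and port here are placeholders.
from httplib import HTTPConnection

connection = HTTPConnection('proxy.example.com', 8080)  # hypothetical proxy
connection.set_tunnel('www.amazon.com', 80)
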
def merge_annotations_with_comments(parent, cid, comments_soup, new_soup):
    '''
    comments_soup: comments potentially with user_annotations
    '''
    # Prepare a new COMMENTS_DIVIDER
    comments_divider = '<div class="comments_divider"><p style="text-align:center;margin:1em 0 1em 0">{0}</p></div>'.format(
        plugin_prefs.get('COMMENTS_DIVIDER', '· · • · ✦ · • · ·'))

    # Remove the old comments_divider
    cds = comments_soup.find('div', 'comments_divider')
    if cds:
        cds.extract()

    # Existing annotations?
    uas = comments_soup.find('div', 'user_annotations')
    if uas:
        # Save the existing annotations to old_soup
        old_soup = BeautifulSoup(unicode(uas))

        # Remove any hrs from old_soup
        hrs = old_soup.findAll('hr')
        if hrs:
            for hr in hrs:
                hr.extract()

        # Remove the existing annotations from comments_soup
        uas.extract()

        # Merge old_soup with new_soup
        merged_soup = unicode(comments_soup) + \
                      unicode(comments_divider) + \
                      unicode(merge_annotations(parent, cid, old_soup, new_soup))
    else:
        # No existing, just merge comments_soup with already sorted new_soup
        merged_soup = unicode(comments_soup) + \
                      unicode(comments_divider) + \
                      unicode(new_soup)

    return merged_soup

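# BeautifulSoup's extract() removes a node from the tree and returns it;
# that is how the old divider, <hr> rules, and prior annotations are lifted
# out above before re-merging. A toy demonstration (BeautifulSoup 3,
# Python 2 assumed):
from calibre.ebooks.BeautifulSoup import BeautifulSoup

soup = BeautifulSoup(u'<div><hr/><p>keep me</p></div>')
for hr in soup.findAll('hr'):
    hr.extract()
assert 'hr' not in unicode(soup)  # only <div><p>keep me</p></div> remains
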
# Variant of get_asin that manages its own connection (self._aConnection)
# and returns the ASIN (or None) instead of raising on failure.
def get_asin(self):
    query = urlencode({'keywords': '%s' % self.title_and_author})
    try:
        self._aConnection.request('GET', '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query,
                                  headers=self.HEADERS)
        response = self._aConnection.getresponse().read()
    except:
        try:
            self._aConnection.close()
            if self._proxy:
                self._aConnection = HTTPConnection(self._http_address, self._http_port)
                self._aConnection.set_tunnel('www.amazon.com', 80)
            else:
                self._aConnection = HTTPConnection('www.amazon.com')

            self._aConnection.request('GET', '/s/ref=sr_qz_back?sf=qz&rh=i%3Adigital-text%2Cn%3A154606011%2Ck%3A' + query[9:] + '&' + query,
                                      headers=self.HEADERS)
            response = self._aConnection.getresponse().read()
        except:
            return None

    # check to make sure there are results
    if ('did not match any products' in response
            and 'Did you mean:' not in response
            and 'so we searched in All Departments' not in response):
        return None

    soup = BeautifulSoup(response)
    results = soup.findAll('div', {'id': 'resultsCol'})
    if not results:
        return None

    for r in results:
        if 'Buy now with 1-Click' in str(r):
            asinSearch = self.AMAZON_ASIN_PAT.search(str(r))
            if asinSearch:
                asin = asinSearch.group(1)
                mi = self._db.get_metadata(self._book_id)
                identifiers = mi.get_identifiers()
                identifiers['mobi-asin'] = asin
                mi.set_identifiers(identifiers)
                self._db.set_metadata(self._book_id, mi)
                return asin

def read_html_toc(self, toc):
    self.base_path = os.path.dirname(toc)
    soup = BeautifulSoup(open(toc, 'rb').read(),
                         convertEntities=BeautifulSoup.HTML_ENTITIES)
    for a in soup.findAll('a'):
        if not a.has_key('href'):
            continue
        purl = urlparse(unquote(a['href']))
        href, fragment = purl[2], purl[5]
        if not fragment:
            fragment = None
        else:
            fragment = fragment.strip()
        href = href.strip()

        txt = ''.join([unicode(s).strip() for s in a.findAll(text=True)])
        add = True
        for i in self.flat():
            if i.href == href and i.fragment == fragment:
                add = False
                break
        if add:
            self.add_item(href, fragment, txt)

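# Why purl[2] and purl[5]: Python 2's urlparse returns a 6-tuple
# (scheme, netloc, path, params, query, fragment).
from urlparse import urlparse

purl = urlparse('chapter1.html#section2')
assert purl[2] == 'chapter1.html'
assert purl[5] == 'section2'
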
def update_results(self, trigger):
    #self._log_location(trigger)
    reader_to_match = str(self.find_annotations_reader_comboBox.currentText())
    color_to_match = str(self.find_annotations_color_comboBox.currentText())
    text_to_match = str(self.find_annotations_text_lineEdit.text())
    note_to_match = str(self.find_annotations_note_lineEdit.text())

    from_date = self.find_annotations_date_from_dateEdit.dateTime().toTime_t()
    to_date = self.find_annotations_date_to_dateEdit.dateTime().toTime_t()

    annotation_map = self.annotated_books_scanner.annotation_map
    #field = self.prefs.get("cfg_annotations_destination_field", None)
    field = get_cc_mapping('annotations', 'field', None)

    db = self.opts.gui.current_db
    matched_titles = []
    self.matched_ids = set()

    for cid in annotation_map:
        mi = db.get_metadata(cid, index_is_id=True)
        soup = None
        if field == 'Comments':
            if mi.comments:
                soup = BeautifulSoup(mi.comments)
        else:
            if mi.get_user_metadata(field, False)['#value#'] is not None:
                soup = BeautifulSoup(mi.get_user_metadata(field, False)['#value#'])

        if soup:
            uas = soup.findAll('div', 'annotation')
            for ua in uas:
                # Are we already logged?
                if cid in self.matched_ids:
                    continue

                # Check reader
                if reader_to_match != self.GENERIC_READER:
                    this_reader = ua['reader']
                    if this_reader != reader_to_match:
                        continue

                # Check color
                if color_to_match != self.GENERIC_STYLE:
                    this_color = ua.find('table')['color']
                    if this_color != color_to_match:
                        continue

                # Check date range, allow for mangled timestamp
                try:
                    timestamp = float(ua.find('td', 'timestamp')['uts'])
                    if timestamp < from_date or timestamp > to_date:
                        continue
                except:
                    continue

                highlight_text = ''
                try:
                    pels = ua.findAll('p', 'highlight')
                    highlight_text = '\n'.join([p.string for p in pels])
                except:
                    pass
                if text_to_match > '':
                    if not re.search(text_to_match, highlight_text, flags=re.IGNORECASE):
                        continue

                note_text = ''
                try:
                    nels = ua.findAll('p', 'note')
                    note_text = '\n'.join([n.string for n in nels])
                except:
                    pass
                if note_to_match > '':
                    if not re.search(note_to_match, note_text, flags=re.IGNORECASE):
                        continue

                # If we made it this far, add the id to matched_ids
                self.matched_ids.add(cid)
                matched_titles.append(mi.title)

    # Update the results box
    matched_titles.sort()
    if len(annotation_map):
        if len(matched_titles):
            first_match = ("<i>%s</i>" % matched_titles[0])
            if len(matched_titles) == 1:
                results = first_match
            else:
                results = first_match + (_(" and {0} more.").format(len(matched_titles) - 1))
            self.result_label.setText('<p style="color:blue">{0}</p>'.format(results))
        else:
            self.result_label.setText('<p style="color:red">{0}</p>'.format(_('no matches')))
    else:
        self.result_label.setText('<p style="color:red">{0}</p>'.format(_('no annotated books in library')))

    self.resize_dialog()

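# Note on the filters above: in Python 2, text_to_match > '' is simply a
# non-empty check, and the match itself is a case-insensitive regex search.
import re

text_to_match = 'whale'  # illustrative pattern
highlight_text = 'The White Whale surfaced'
if text_to_match > '':  # any non-empty string compares greater than ''
    assert re.search(text_to_match, highlight_text, flags=re.IGNORECASE)
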
def comments_to_html(comments):
    '''
    Convert random comment text to normalized, xml-legal block of <p>s

    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p>' returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns as
    <p>A line of text.<br />A second line of text.<br />A third line of text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()
    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, unicode):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>' % x.replace(u'\n', u'<br />')
                 for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except:
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(lambda m: m.group().replace('.', '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(lost_cr.group(),
                                    '%s%s\n\n%s' % (lost_cr.group(1), lost_cr.group(2), lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '—')

    soup = BeautifulSoup(comments)
    result = BeautifulSoup()
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    for token in all_tokens:
        if type(token) is NavigableString:
            if not open_pTag:
                pTag = Tag(result, 'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, prepare_string_for_xml(token))
            ptc += 1
        elif type(token) in (CData, Comment, Declaration, ProcessingInstruction):
            continue
        elif token.name in ['br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a', 'hr']:
            if not open_pTag:
                pTag = Tag(result, 'p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            if open_pTag:
                result.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            result.insert(rtc, token)
            rtc += 1

    if open_pTag:
        result.insert(rtc, pTag)

    for p in result.findAll('p'):
        p['class'] = 'description'

    for t in result.findAll(text=True):
        t.replaceWith(prepare_string_for_xml(unicode(t)))

    return result.renderContents(encoding=None)

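# Quick checks of the plain-text paths documented in the docstring above
# (Python 2 assumed, and assuming prepare_string_for_xml leaves plain ASCII
# untouched):
assert comments_to_html(u'') == u'<p></p>'
assert comments_to_html(u'A line\n\nAnother line') == (
    u'<p class="description">A line</p>\n'
    u'<p class="description">Another line</p>')
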
def comments_to_html(comments): """ Convert random comment text to normalized, xml-legal block of <p>s 'plain text' returns as <p>plain text</p> 'plain text with <i>minimal</i> <b>markup</b>' returns as <p>plain text with <i>minimal</i> <b>markup</b></p> '<p>pre-formatted text</p> returns untouched 'A line of text\n\nFollowed by a line of text' returns as <p>A line of text</p> <p>Followed by a line of text</p> 'A line of text.\nA second line of text.\rA third line of text' returns as <p>A line of text.<br />A second line of text.<br />A third line of text.</p> '...end of a paragraph.Somehow the break was lost...' returns as <p>...end of a paragraph.</p> <p>Somehow the break was lost...</p> Deprecated HTML returns as HTML via BeautifulSoup() """ if not comments: return u"<p></p>" if not isinstance(comments, unicode): comments = comments.decode(preferred_encoding, "replace") if comments.lstrip().startswith("<"): # Comment is already HTML do not mess with it return comments if "<" not in comments: comments = prepare_string_for_xml(comments) parts = [u'<p class="description">%s</p>' % x.replace(u"\n", u"<br />") for x in comments.split("\n\n")] return "\n".join(parts) if sanitize_pat.search(comments) is not None: try: return sanitize_comments_html(comments) except: import traceback traceback.print_exc() return u"<p></p>" # Explode lost CRs to \n\n comments = lost_cr_exception_pat.sub(lambda m: m.group().replace(".", ".\r"), comments) for lost_cr in lost_cr_pat.finditer(comments): comments = comments.replace( lost_cr.group(), "%s%s\n\n%s" % (lost_cr.group(1), lost_cr.group(2), lost_cr.group(3)) ) comments = comments.replace(u"\r", u"") # Convert \n\n to <p>s comments = comments.replace(u"\n\n", u"<p>") # Convert solo returns to <br /> comments = comments.replace(u"\n", "<br />") # Convert two hyphens to emdash comments = comments.replace("--", "—") soup = BeautifulSoup(comments) result = BeautifulSoup() rtc = 0 open_pTag = False all_tokens = list(soup.contents) for token in all_tokens: if type(token) is NavigableString: if not open_pTag: pTag = Tag(result, "p") open_pTag = True ptc = 0 pTag.insert(ptc, prepare_string_for_xml(token)) ptc += 1 elif type(token) in (CData, Comment, Declaration, ProcessingInstruction): continue elif token.name in ["br", "b", "i", "em", "strong", "span", "font", "a", "hr"]: if not open_pTag: pTag = Tag(result, "p") open_pTag = True ptc = 0 pTag.insert(ptc, token) ptc += 1 else: if open_pTag: result.insert(rtc, pTag) rtc += 1 open_pTag = False ptc = 0 result.insert(rtc, token) rtc += 1 if open_pTag: result.insert(rtc, pTag) for p in result.findAll("p"): p["class"] = "description" for t in result.findAll(text=True): t.replaceWith(prepare_string_for_xml(unicode(t))) return result.renderContents(encoding=None)