def render_title(self) -> None:
    """Render and write the title.xhtml file (title page of the book)."""
    logger.debug("Rendering title.xhtml...")
    title_page = join(OEBPS, "title.xhtml")
    self.render_and_write(title_page, title=self.title)
def is_rss(node: ElementTree.Element, title: str) -> bool:
    """Return true if the element is indeed an RSS feed."""
    feed_type = node.attrib.get("type")
    if feed_type == "rss":
        return True
    logger.debug(f"Unknown type for `{title}`: {feed_type}")
    return False
def render_html_toc(self) -> None:
    """Generate, render and write the toc.xhtml file (Table of Contents)."""
    logger.debug("Rendering toc.xhtml...")
    self.render_and_write(
        join(OEBPS, "toc.xhtml"),
        toc=self.generate_html_toc(),
        title=self.title,
    )
def render_ncx_toc(self) -> None:
    """Generate, render and write the toc.ncx file (Table of Contents)."""
    logger.debug("Rendering toc.ncx...")
    navpoints = self.generate_navpoints()
    self.render_and_write(
        join(OEBPS, "toc.ncx"),
        title=self.title,
        uuid=self.uuid,
        navpoints=navpoints,
    )
def remove_container(self, container: Optional[Container] = None) -> None:
    """Stop and remove a docker container (defaults to this instance's own)."""
    target = container or self.container
    if not target:
        return
    logger.debug("Stopping container...")
    target.stop()
    logger.debug("Removing container...")
    target.remove()
def prepare_epub_dirs(self) -> None:
    """Create all the basic folders in the EPUB archive."""
    logger.debug("Creating EPUB folders...")
    # Same three directories the EPUB layout requires, created in order
    for parts in ((META_INF,), (OEBPS, CONTENT), (OEBPS, IMAGES)):
        makedirs(join(self._dst_path, *parts))
def clean_existing_containers(self) -> None:
    """Remove any existing mercury parser API containers."""
    expected_names = [f"/{CONTAINER_NAME}"]
    for existing in self.client.containers.list(all=True, sparse=True):
        if existing.attrs["Names"] == expected_names:
            logger.debug(
                "Found an existing container with the same name...")
            self.remove_container(existing)
def get_title(node: ElementTree.Element) -> Optional[str]:
    """Retrieve the feed's title from the XML element."""
    # The `title` and `text` attributes are usually identical
    raw_title = node.attrib.get("title", node.attrib.get("text"))
    if not raw_title:
        logger.debug("Could not find title for RSS feed")
        return raw_title
    return strip_common_unicode_chars(raw_title)
def copy_fixed_files(self) -> None:
    """Copy to the destination folder all the files that don't require
    rendering and can be copied as is."""
    logger.debug("Copying fixed files...")
    for fixed_file in (
        "mimetype",
        join(META_INF, "container.xml"),
        join(OEBPS, IMAGES, "cover.png"),
        join(OEBPS, "stylesheet.css"),
    ):
        self.copy_file(fixed_file)
def generate_spine_articles(self) -> str:
    """Create <itemref> elements for all the articles in the <spine> section."""
    logger.debug("Generating spine articles...")
    itemref = Template('<itemref idref="${id}"/>')
    entries = (itemref.substitute(id=article.id) for article in self.articles)
    return "\n\t".join(entries)
def send_email_message(server: smtplib.SMTP, msg: EmailMessage) -> bool:
    """Send a single EmailMessage; return True on success, False on failure."""
    logger.debug("Sending the email...")
    try:
        server.send_message(msg)
    except smtplib.SMTPException as e:
        logger.error(
            f"Caught an exception while trying to send an email.\nError: {e}")
        return False
    return True
def run_mercury_container(self) -> Container:
    """Launch a new mercury-parser docker container and remember it on self."""
    logger.debug("Launching a new mercury-parser Docker container...")
    container = self.client.containers.run(
        "wangqiru/mercury-parser-api:latest",
        detach=True,
        ports={f"{MERCURY_PORT}/tcp": MERCURY_PORT},
        name=CONTAINER_NAME,
    )
    self.container = container
    return container
def download_image(self, url: str) -> str:
    """Download an image from a URL into the images folder.

    :param url: the image's source URL
    :returns: the (possibly truncated) local file name of the image
    """
    from os.path import splitext

    logger.debug(f"Downloading image {url}...")
    image_name = images.get_image_filename(url)
    if len(image_name) > 150:
        # BUG FIX: the original blind `image_name[:150]` could chop off the
        # file extension, which the manifest later relies on to derive the
        # image's media-type. Truncate only the stem and keep the extension.
        stem, ext = splitext(image_name)
        image_name = stem[: max(1, 150 - len(ext))] + ext
    image_path = join(self.images_path, image_name)
    images.download_image(url, image_path)
    return image_name
def generate_manifest_articles(self) -> str:
    """Create <item> elements for all the articles in the <manifest> section."""
    logger.debug("Generating manifest articles...")
    item = Template(
        '<item id="${id}" href="content/${id}.xhtml" media-type="application/xhtml+xml"/>'
    )
    entries = (item.substitute(id=article.id) for article in self.articles)
    return "\n\t".join(entries)
def generate_navpoints(self) -> str:
    """Create a navpoint per article for use in the toc.ncx file."""
    logger.debug("Generating navpoints...")
    template = self.get_template(join(_R2K, "navpoint.xml"))
    rendered = [
        template.substitute(
            id=article.id,
            title=article.title,
            # Navpoint ordering is offset to leave room for fixed pages
            order=index + NAVPOINT_OFFSET,
        )
        for index, article in enumerate(self.articles)
    ]
    return "\n\t\t".join(rendered)
def create_epub(raw_articles: List[Article], title: str) -> str:
    """Create an EPUB book from multiple articles.

    :returns: temp path to created ebook
    """
    epub_path = mkdtemp()
    logger.debug(f"Creating epub folder in {epub_path}")
    articles = [EPUBArticle(raw, epub_path) for raw in raw_articles]
    return EPUB(articles, title, epub_path).build()
def validate_container_is_up() -> None:
    """Try to connect to the mercury parser service several times.
    Quit app if not successful.
    """
    errors = set()
    logger.debug(
        f"Launched container at {BASE_MERCURY_URL}. Validating it's up...")
    # BUG FIX: the original `while retries := CONNECTION_ATTEMPTS` re-bound
    # the counter to CONNECTION_ATTEMPTS on *every* iteration, so the
    # `retries -= 1` never took effect and the loop could spin forever.
    # A bounded for-loop retries exactly the documented number of times.
    for _ in range(CONNECTION_ATTEMPTS):
        try:
            requests.get(BASE_MERCURY_URL)
            logger.debug("Connected!")
            return
        except ConnectionError as e:
            errors.add(e)
            sleep(1)
    # All attempts failed - report the collected errors (previously gathered
    # but never used) and quit, as the docstring promises
    logger.error("Could not connect to the mercury parser container")
    logger.debug(f"Connection errors:\n{errors}")
    sys.exit(1)
def generate_manifest_images(self) -> str:
    """Create <item> elements for all the images in the <manifest> section."""
    logger.debug("Generating manifest images...")
    item = Template(
        '<item id="${id}" href="images/${id}" media-type="image/${ext}"/>')
    images_dir = join(self._dst_path, OEBPS, IMAGES)
    # The cover is declared separately, so skip it here
    entries = [
        item.substitute(id=image_name, ext=images.get_img_extension(image_name))
        for image_name in listdir(images_dir)
        if image_name != "cover.png"
    ]
    return "\n\t".join(entries)
def get_feeds_from_url(url: str) -> list:
    """Try to parse the URL and find any RSS feeds in the webpage

    Adapted from: https://gist.github.com/alexmill/9bc634240531d81c3abe
    """
    logger.info(f"Attempting to find RSS feeds from {url}...")
    # If the URL itself is a proper RSS feed, just return it
    if is_rss_feed(url):
        logger.debug("URL is already a proper RSS feed")
        return [url]
    html = get_html(url)
    candidates = set(get_feeds_from_links(html) + get_feeds_from_atags(url, html))
    # Note: renamed the comprehension variable so it no longer shadows `url`
    return [candidate for candidate in candidates if is_rss_feed(candidate)]
def set_content(msg: EmailMessage, title: str, url: Optional[str],
                attachment_path: Optional[str]) -> None:
    """Either set the text content of the email message, or attach an
    attachment, based on the current parser.

    :param msg: message mutated in place
    :param title: title used to name the attachment
    :param url: plain URL to send when there is no attachment
    :param attachment_path: path to the generated ebook file, if any
    """
    if attachment_path:
        # We are marking the attachment as HTML, although it's an epub, because kindle doesn't officially accept
        # EPUB files in emails, but unofficially it will convert the file with kindlegen and it'll work fine
        # Reference: https://www.amazon.com/gp/sendtokindle/email
        filename = f"{title}.html"
        logger.debug(f"Setting attachment for {title}")
        with open(attachment_path, "rb") as f:
            msg.add_attachment(
                f.read(),
                maintype="text",
                # BUG FIX: the `name` parameter was a literal placeholder
                # string; it should carry the actual attachment file name
                subtype=f'html; charset=utf-8; name="{filename}"',
                filename=filename,
            )
    elif url:
        logger.debug(f"Setting email content to {url}")
        msg.set_content(url)
def compress_epub(self) -> str:
    """Create the EPUB ZIP archive

    First add the `mimetype` file - EPUB specs say it must be first and
    uncompressed. Then, recursively add all the files and folders under
    META-INF and OEBPS
    """
    logger.debug("Creating an epub archive...")
    epub_path = join(mkdtemp(prefix="epub"), f"{self.id}.epub")
    # The EPUB must contain the META-INF and mimetype files at the root, so
    with ZipFile(epub_path, "w") as epub:
        # Add the mimetype file first (as is required by EPUB format) and
        # store it uncompressed
        epub.write(
            join(self._dst_path, MIMETYPE),
            arcname=MIMETYPE,
            compress_type=ZIP_STORED,
        )
        self.recursively_add_files_to_epub_archive(epub)
    return epub_path
def __exit__(self, exc_type: Optional[Type[BaseException]],
             exc_val: Optional[BaseException],
             exc_tb: Optional[TracebackType]) -> bool:
    """Context manager __exit__ for MercuryParser

    1. Remove the container
    2. Stop the program for any expected errors
    """
    self.remove_container()
    if not exc_val:
        return True
    if not isinstance(exc_val, (DockerAPIError, ConnectionError)):
        # Unexpected errors propagate to the caller
        raise exc_val
    logger.error(
        "Could not connect to Docker. Run with -v to get more details"
    )
    logger.debug(f"Error info:\n{exc_val}")
    sys.exit(1)
def render_opf(self) -> None:
    """Render the content.opf XML file

    content.opf requires the following:
    1. <item> elements for all the articles in the <manifest> section
    2. <item> elements for all the images in the <manifest> section
    3. <itemref> elements for all the articles in the <spine> section
    """
    logger.debug("Generating content.opf...")
    self.render_and_write(
        join(OEBPS, "content.opf"),
        title=self.title,
        date=self.date,
        uuid=self.uuid,
        manifest_articles=self.generate_manifest_articles(),
        manifest_images=self.generate_manifest_images(),
        spine_articles=self.generate_spine_articles(),
    )
def get_parsed_doc(url: str) -> dict:
    """Make an HTTP call to the mercury API and get the parsed document."""
    full_url = f"{BASE_MERCURY_URL}?url={url}"
    logger.debug("Parsing article with Mercury Parser...")
    logger.debug(f"Sending request to {full_url}")
    response = requests.get(full_url)
    parsed = response.json()
    logger.debug("Finished parsing")
    return parsed
def parse_images(self, raw_content: str) -> str:
    """Parse and download images in the article

    Go over the content of the article and:
    1. Find all the `img` tags in the HTML
    2. Download all the images to the `images` folder in the EPUB dir
    3. Set the relative paths to those images in the HTML content
    4. Update the `content` attribute with the new HTML content
    """
    soup = BeautifulSoup(raw_content, "html.parser")
    logger.debug("Looking for images...")
    for img in soup.find_all("img"):
        img_url = images.get_img_url(self.url, img)
        if not img_url:
            continue
        # As the articles live in the `content` folder, the image path must
        # go one level up
        img["src"] = join("..", IMAGES, self.download_image(img_url))
    return soup.decode()
def render_articles(self) -> None:
    """Go over all the articles and write their formatted content to disk

    For each article:
    1. Parse its content (also downloads images)
    2. If the parse did not succeed do nothing
    3. If the article was parsed successfully, use the `article.xhtml`
       template to create the final article
    """
    logger.debug("Rendering articles...")
    parser_cls = self._get_parser_class()
    template_path = join(OEBPS, CONTENT, "article.xhtml")
    with parser_cls() as parser:
        for article in self.articles:
            if not article.parse(parser):
                continue
            rendered = self.render_template(
                template_path,
                title=article.title,
                author=article.author,
                date=article.date,
                content=article.content,
            )
            destination = join(OEBPS, CONTENT, f"{article.id}.xhtml")
            self.write_file(rendered, destination)
def send_email_messages(msgs: List[EmailMessage]) -> int:
    """Send the given email messages over Gmail SMTP; return how many were sent."""
    messages_sent = 0
    try:
        logger.debug("Connecting to SMTP...")
        with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
            server.ehlo()
            logger.debug("Logging into the SMTP server...")
            server.login(config.send_from, config.password)
            for msg in msgs:
                if not send_email_message(server, msg):
                    continue
                messages_sent += 1
                logger.debug("Email sent successfully!")
    except smtplib.SMTPException as e:
        logger.error(
            f"Caught an exception while trying to send an email.\nError: {e}")
    return messages_sent
def get_url(node: ElementTree.Element, title: str) -> Optional[str]:
    """Retrieve the feed's URL from the XML element."""
    feed_url = node.attrib.get("xmlUrl")
    if feed_url:
        return feed_url
    logger.debug(f"Could not find URL for `{title}`")
    return feed_url