def ConvertFromEPUB(self, sourceFilePath: Path, outputDirectoryPath: Path) -> bool:

    ##
    #
    # Converts an EPUB file to a MOBI file. The output file may exist: it will be overwritten if
    # it does.
    #
    # @param sourceFilePath      Path to the EPUB file.
    # @param outputDirectoryPath Path to the output directory. The output file will be created
    #                            inside it; its (base)name will be the same as the name of the
    #                            source file. The directory **has** to exist beforehand, this
    #                            method does *not* create it.
    #
    # @return **True** if the conversion has been performed successfully, **False** otherwise.
    #
    ##

    if not sourceFilePath.is_file():
        return False
    elif not outputDirectoryPath.is_dir():
        return False

    call(
        [
            "ebook-convert",
            Stringify(sourceFilePath),
            Stringify(outputDirectoryPath / (sourceFilePath.stem + ".mobi")),
        ],
        stdout=DEVNULL,
        stderr=DEVNULL
    )

    return True
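# Usage sketch (illustrative only, not part of the project). Assumptions: the method above
# lives on a converter class referred to here by the hypothetical name "Converter", Calibre's
# "ebook-convert" CLI is installed and on the PATH, and the output directory already exists.

from pathlib import Path

def _ExampleEPUBConversion(converter: "Converter") -> None:
    # Converts "Story.epub"; on success, the result lands at "Output/Story.mobi".
    if converter.ConvertFromEPUB(Path("Story.epub"), Path("Output")):
        print("Conversion succeeded.")
    else:
        print("Conversion failed: missing input file or output directory.")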
def _InternallyExtractChapter(
        self,
        URL: str,
        soup: Optional[BeautifulSoup]
) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param URL  The URL of the page containing the chapter.
    # @param soup The tag soup of the page containing the chapter.
    #
    # @return The extracted Chapter, or **None** if the extraction has failed.
    #
    ##

    contentElements = soup.select("p")
    if len(contentElements) < 5:
        logging.error("Chapter page doesn't contain enough paragraphs.")
        return None

    # Remove the first two and the last three paragraphs; they surround the story content,
    # but aren't part of it.
    contentElements[0].decompose()
    contentElements[1].decompose()
    contentElements[-1].decompose()
    contentElements[-2].decompose()
    contentElements[-3].decompose()

    return Chapter(title=None, content=Stringify(soup))
def _InternallyExtractChapter(
        self,
        URL: str,
        soup: Optional[BeautifulSoup]
) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param URL  The URL of the page containing the chapter.
    # @param soup The tag soup of the page containing the chapter.
    #
    # @return The extracted Chapter, or **None** if the extraction has failed.
    #
    ##

    # Locate relevant page elements.

    titleElement = soup.select_one("h2#quizSubtitle")
    if not titleElement:
        logging.error("Title element not found.")
        return None

    contentElement = soup.select_one("#rescontent")
    if not contentElement:
        logging.error("Content element not found.")
        return None

    # Return.

    return Chapter(
        titleElement.get_text().strip(),
        Stringify(contentElement.encode_contents())
    )
def ExtractChapter(self, index: int) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param index The index of the chapter to be extracted.
    #
    # @return The extracted Chapter, or **None** if the extraction has failed.
    #
    ##

    if 1 == self.Story.Metadata.ChapterCount:

        titleElement = None

        contentElement = self._storySoup.select_one("div#chapters div.userstuff")
        if not contentElement:
            logging.error("Content element not found.")
            return None

        # Remove the hidden chapter-landmark heading, if present.
        if (landmarkElement := contentElement.select_one("h3#work")):
            landmarkElement.decompose()

        return Chapter(
            title=titleElement.get_text().strip() if titleElement else None,
            content=Stringify(contentElement.encode_contents())
        )
def _InternallyExtractChapter(
        self,
        URL: str,
        soup: Optional[BeautifulSoup]
) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param URL  The URL of the page containing the chapter.
    # @param soup The tag soup of the page containing the chapter.
    #
    # @return The extracted Chapter, or **None** if the extraction has failed.
    #
    ##

    # Extract the title.

    title = None

    titleElement = soup.select_one("p.highlighted-image__title > a")
    if titleElement:
        title = titleElement.get_text().strip()

    # Extract the content.

    contentElement = soup.select_one("div.storytext-container")
    if not contentElement:
        logging.error("Couldn't find the content element.")
        return None

    # Return.

    return Chapter(
        title=title,
        content=Stringify(contentElement.encode_contents())
    )
def _InternallyExtractChapter(
        self,
        URL: str,
        soup: Optional[BeautifulSoup]
) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param URL  The URL of the page containing the chapter.
    # @param soup The tag soup of the page containing the chapter.
    #
    # @return The extracted Chapter, or **None** if the extraction has failed.
    #
    ##

    # Locate relevant page elements.

    # No error-checking here. Not sure if every chapter has to have a title on WW.
    titleElement = soup.select_one("div#chapter-outer > div.caption > div > h4")

    contentElement = soup.select_one("div#chapter-content")
    if not contentElement:
        logging.error("Content element not found.")
        return None

    # Return.

    return Chapter(
        titleElement.get_text().strip() if titleElement else "",
        Stringify(contentElement.encode_contents())
    )
def _InternallyExtractChapter(
        self,
        URL: str,
        soup: Optional[BeautifulSoup]
) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param URL  The URL of the page containing the chapter.
    # @param soup The tag soup of the page containing the chapter.
    #
    # @return The extracted Chapter, or **None** if the extraction has failed.
    #
    ##

    # Extract the content.

    contentElement = soup.select_one("div#story")
    if not contentElement:
        logging.error("Couldn't find the content element.")
        return None

    # Return.

    return Chapter(
        title=self._chapterTitles[URL] if (URL in self._chapterTitles) else None,
        content=Stringify(contentElement.encode_contents())
    )
def ConvertFromODT(
        self,
        sourceFilePath: Path,
        outputDirectoryPath: Path,
        converterFilePath: Path
) -> bool:

    ##
    #
    # Converts an ODT file to a PDF file. The output file may exist: it will be overwritten if
    # it does.
    #
    # @param sourceFilePath      Path to the ODT file.
    # @param outputDirectoryPath Path to the output directory. The output file will be created
    #                            inside it; its (base)name will be the same as the name of the
    #                            source file. The directory **has** to exist beforehand, this
    #                            method does *not* create it.
    # @param converterFilePath   Path to the LibreOffice executable (soffice.exe/soffice).
    #
    # @return **True** if the conversion has been performed successfully, **False** otherwise.
    #
    ##

    if not sourceFilePath.is_file():
        return False
    elif not outputDirectoryPath.is_dir():
        return False
    elif not converterFilePath.is_file():
        return False

    call(
        [
            Stringify(converterFilePath),
            "--headless",
            "--convert-to", "pdf",
            Stringify(sourceFilePath),
            "--outdir", Stringify(outputDirectoryPath),
        ],
        stdout=DEVNULL
    )

    return True
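# Usage sketch (illustrative only, not part of the project). "Converter" is the same
# hypothetical stand-in class name as above; the soffice path is an example and varies per
# system (e.g. "C:/Program Files/LibreOffice/program/soffice.exe" on Windows).

from pathlib import Path

def _ExamplePDFConversion(converter: "Converter") -> None:
    # Converts "Story.odt"; on success, LibreOffice writes "Output/Story.pdf".
    if converter.ConvertFromODT(Path("Story.odt"), Path("Output"), Path("/usr/bin/soffice")):
        print("Conversion succeeded.")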
def Post(
        self,
        URL: str,
        payload,
        text: bool = True,
        textEncoding: str = DEFAULT_TEXT_ENCODING
) -> Optional[Union[bytes, str]]:

    ##
    #
    # Posts some data and receives the response.
    #
    # @param URL          The URL.
    # @param payload      The data to be posted.
    # @param text         Should the response be converted to text?
    # @param textEncoding The text encoding to be used during the conversion.
    #
    # @return Retrieved response (as *bytes* or *str*), or **None**.
    #
    ##

    # Prepare the headers.

    requestHeaders = {"User-Agent": self._userAgent}

    # Send the request.

    response = self._session.post(URL, headers=requestHeaders, data=payload)
    if (not response) or (200 != response.status_code):
        return None

    # Process the response.

    data = Stringify(response.content, encoding=textEncoding) if text else response.content

    # Return.

    return data
def _InternallyExtractChapter(
        self,
        URL: str,
        soup: Optional[BeautifulSoup]
) -> Optional[Chapter]:

    ##
    #
    # Extracts specific chapter.
    #
    # @param URL  The URL of the page containing the chapter.
    # @param soup The tag soup of the page containing the chapter.
    #
    # @return The extracted Chapter, or **None** if the extraction has failed.
    #
    ##

    rowElements = soup.select("div#contentdata > table > tr")
    if (not rowElements) or (len(rowElements) < 3):
        logging.error("Chapter page doesn't conform to expected format.")
        return None

    return Chapter(
        title=None,
        content=Stringify(rowElements[2].encode_contents())
    )
def Get(
        self,
        URL: str,
        text: bool = True,
        textEncoding: str = DEFAULT_TEXT_ENCODING,
        stream: bool = False
) -> Optional[Union[bytes, str]]:

    ##
    #
    # Retrieves data using a GET request.
    #
    # @param URL          The URL.
    # @param text         Should the response be converted to text?
    # @param textEncoding The text encoding to be used during the conversion.
    # @param stream       Read data stream.
    #
    # @return Retrieved response (as *bytes* or *str*), or **None**.
    #
    ##

    # Prepare the headers.

    requestHeaders = {"User-Agent": self._userAgent}

    # Send the request.

    response = self._session.get(URL, headers=requestHeaders, stream=stream)
    if (not response) or (200 != response.status_code):
        return None

    # Process the response.

    data = Stringify(response.content, encoding=textEncoding) if text else response.content

    # Return.

    return data
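# Usage sketch (illustrative only, not part of the project). "WebSession" is a hypothetical
# name for the class hosting the Get()/Post() methods above; the URLs are examples.

def _ExampleRequests(session: "WebSession") -> None:
    # Retrieve a page as decoded text (the default behavior).
    page = session.Get("https://example.com/")
    if page is None:
        print("GET failed (connection error or non-200 status code).")

    # Retrieve raw bytes instead, e.g. for binary media.
    data = session.Get("https://example.com/image.jpg", text=False)

    # Post form data and read the textual response.
    response = session.Post("https://example.com/login", payload={"user": "name"})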
class ExtractorLiterotica(Extractor):

    def __init__(self) -> None:

        ##
        #
        # The constructor.
        #
        ##

        super().__init__()

    def GetSupportedHostnames(self) -> List[str]:

        ##
        #
        # Returns a list of hostnames supported by the extractor.
        #
        # @return A list of supported hostnames.
        #
        ##

        return ["literotica.com"]

    def ScanChannel(self, URL: str) -> Optional[List[str]]:

        ##
        #
        # Scans the channel: generates the list of story URLs.
        #
        # @param URL The URL of the channel.
        #
        # @return **None** when the scan fails, a list of story URLs when it doesn't fail.
        #
        ##

        if (not URL) or (GetHostname(URL) not in self.GetSupportedHostnames()):
            return None

        # Download the author's profile page.

        userIDMatch = re.search(r"\?uid=(\d+)", URL)
        if not userIDMatch:
            return None
        userID = userIDMatch.group(1)

        userPageURL = f"{self.MEMBER_PAGE_URL}uid={userID}&page=submissions"
        soup = self._webSession.GetSoup(userPageURL)
        if not soup:
            return None

        # Locate all the stories. Stand-alone stories sit in "root-story" rows; stories that
        # belong to a series are listed in "sl" rows following a "ser-ttl" series title row.

        storyURLs = []

        storyHeaderElement = soup.select_one("tr.st-top")
        if not storyHeaderElement:
            return None

        storyRowElement = storyHeaderElement.next_sibling
        while storyRowElement:

            if not storyRowElement.has_attr("class"):
                break

            if "root-story" in storyRowElement["class"]:

                anchorElement = storyRowElement.select_one("a")
                if anchorElement and anchorElement.has_attr("href"):
                    storyURLs.append(anchorElement["href"])

                storyRowElement = storyRowElement.next_sibling

            elif "ser-ttl" in storyRowElement["class"]:

                storyRowElement = storyRowElement.next_sibling
                if not storyRowElement:
                    break

                if (not storyRowElement.has_attr("class")) or ("sl" not in storyRowElement["class"]):
                    continue

                anchorElement = storyRowElement.select_one("a")
                if anchorElement and anchorElement.has_attr("href"):
                    storyURLs.append(anchorElement["href"])

                storyRowElement = storyRowElement.next_sibling

            elif "sl" in storyRowElement["class"]:

                storyRowElement = storyRowElement.next_sibling

            else:

                break

        # Return.

        return storyURLs

    def _InternallyScanStory(self, URL: str, soup: Optional[BeautifulSoup]) -> bool:

        ##
        #
        # Scans the story: generates the list of chapter URLs and retrieves the
        # metadata.
        #
        # @param URL  The URL of the story.
        # @param soup The tag soup.
        #
        # @return **False** when the scan fails, **True** when it doesn't fail.
        #
        ##

        # Extract basic metadata.

        titleElement = soup.select_one("div.b-story-header > h1") or \
            soup.select_one("h1.headline")
        if not titleElement:
            logging.error("Title element not found.")
            return False

        authorElement = soup.select_one("div.b-story-header > span.b-story-user-y > a") or \
            soup.select_one("div.panel > div.y_eS > a.y_eU")
        if not authorElement:
            logging.error("Author element not found.")
            return False

        # Download the author's page.

        if not authorElement.has_attr("href"):
            logging.error("Can't find the URL of the author's page.")
            return False
        authorsPageURL = authorElement["href"]

        authorsPageSoup = self._webSession.GetSoup(authorsPageURL)
        if not authorsPageSoup:
            logging.error(f'Failed to download page: "{authorsPageURL}".')
            return False

        # Extract remaining metadata.
        storyRowElement = None
        for storyLinkElement in authorsPageSoup.select("td.fc > a"):
            if storyLinkElement.get_text().strip() == titleElement.get_text().strip():
                storyRowElement = storyLinkElement.parent.parent
                break

        if not storyRowElement:
            logging.error("Failed to find the story's entry on the author's page.")
            return False

        storyMetadataElements = storyRowElement.find_all("td")
        if len(storyMetadataElements) < 4:
            logging.error("Can't extract metadata from the author's page.")
            return False

        summaryElement = storyMetadataElements[1]
        publishedElement = storyMetadataElements[3]

        # Prepare metadata.

        title = titleElement.get_text().strip()
        datePublished = self._ReformatDate(publishedElement.get_text().strip())
        dateUpdated = self._ReformatDate(publishedElement.get_text().strip())

        # Check if the story belongs to a series.

        seriesRowElement = None
        if storyRowElement.has_attr("class") and ("sl" in storyRowElement["class"]):
            seriesRowElement = storyRowElement.find_previous_sibling("tr", {"class": "ser-ttl"})

        if seriesRowElement:

            title = seriesRowElement.get_text().strip()

            # Collect the URLs and publication dates of all the chapters in the series.

            chapterDates = []

            seriesChapterRowElement = seriesRowElement.next_sibling
            while seriesChapterRowElement:

                if (not seriesChapterRowElement.has_attr("class")) or \
                   ("sl" not in seriesChapterRowElement["class"]):
                    break

                seriesChapterAnchorElement = seriesChapterRowElement.select_one("a")
                if (not seriesChapterAnchorElement) or \
                   (not seriesChapterAnchorElement.has_attr("href")):
                    break

                seriesChapterCellElements = seriesChapterRowElement.select("td")
                if seriesChapterCellElements:
                    chapterDates.append(seriesChapterCellElements[-1].get_text().strip())

                self._chapterURLs.append(seriesChapterAnchorElement["href"])

                seriesChapterRowElement = seriesChapterRowElement.next_sibling

            if chapterDates:
                datePublished = self._ReformatDate(chapterDates[0])
                dateUpdated = self._ReformatDate(chapterDates[-1])

        else:

            self._chapterURLs = [self.Story.Metadata.URL]

        # Set the metadata.

        self.Story.Metadata.Title = title
        self.Story.Metadata.Author = authorElement.get_text().strip()
        self.Story.Metadata.DatePublished = datePublished
        self.Story.Metadata.DateUpdated = dateUpdated
        self.Story.Metadata.ChapterCount = len(self._chapterURLs)
        self.Story.Metadata.WordCount = 0
        self.Story.Metadata.Summary = StripHTML(summaryElement.get_text()).strip()

        # Return.

        return True

    def _InternallyExtractChapter(
            self,
            URL: str,
            soup: Optional[BeautifulSoup]
    ) -> Optional[Chapter]:

        ##
        #
        # Extracts specific chapter.
        #
        # @param URL  The URL of the page containing the chapter.
        # @param soup The tag soup of the page containing the chapter.
        #
        # @return The extracted Chapter, or **None** if the extraction has failed.
        #
        ##

        # Find the page count of the story.

        pageCount = 1

        if (pageSelectElement := soup.find("select", {"name": "page"})):
            pageCount = len(pageSelectElement.find_all("option"))

        if pageCount < 1:
            logging.error("Failed to read the story's page count.")
            return None

        if (pageSelectElement := soup.select_one("div.panel.clearfix.l_bH")):
            if (linkElements := pageSelectElement.find_all("a")):
                pageCount = int(Stringify(linkElements[-1].encode_contents()))
pageURL = URL + f"?page={pageIndex}" soup = self._webSession.GetSoup(pageURL) if not soup: logging.error(f'Failed to download page: "{pageURL}".') return None contentElement = soup.select_one( "div.b-story-body-x > div") or soup.select_one( "div.panel.article") if not contentElement: logging.error("Story content element not found.") return None content += "<br/><br/>" + Stringify( contentElement.encode_contents()) # Return. return Chapter(title=None, content=content) def _GetNormalizedStoryURL(self, URL: str) -> str: ## # # Returns a normalized story URL, i.e. one that can be used for anything. # # @param URL Input URL (given by the user). # # @return Normalized URL. #
f"Only {len(chapterElements)} chapter(s) located. " f"The story supposedly has {self.Story.Metadata.ChapterCount} chapter(s)." ) return None currentChapterElement = chapterElements[index - 1] titleElement = currentChapterElement.select_one("h3.title") contentElement = currentChapterElement.select_one("div.userstuff") if (landmarkElement := contentElement.select_one("h3#work")): landmarkElement.decompose() return Chapter(title=titleElement.get_text().strip() if titleElement else None, content=Stringify(contentElement.encode_contents())) def _ScanWorks(self, URL: str) -> Optional[List[str]]: ## # # Scans a list of works: generates the list of story URLs. # # @param URL The URL. # # @return **None** when the scan fails, a list of story URLs when it doesn't fail. # ## # Check the arguments.
        title = selectedChapterElement.text.strip()
        if title and (titleMatch := re.search(r"\d+\. (.*)", title)):
            title = titleMatch.group(1)

        # Read the content.

        storyTextElement = soup.find(id="storytext")
        if not storyTextElement:
            logging.error("Story text element not found.")
            return None

        # Create the Chapter and return it.

        return Chapter(
            title=title,
            content=Stringify(storyTextElement.encode_contents())
        )

    @staticmethod
    def _GetStoryID(URL: str) -> Optional[str]:

        if not URL:
            return None

        storyIDMatch = re.search(r"/s/(\d+)/", URL)
        if not storyIDMatch:
            return None

        return storyIDMatch.group(1)

    @staticmethod
    def _ReformatDate(date: str) -> Optional[str]:
class ExtractorHentaiFoundry(Extractor):

    def __init__(self) -> None:

        ##
        #
        # The constructor.
        #
        ##

        super().__init__()

    def GetSupportedHostnames(self) -> List[str]:

        ##
        #
        # Returns a list of hostnames supported by the extractor.
        #
        # @return A list of supported hostnames.
        #
        ##

        return ["hentai-foundry.com"]

    def ScanChannel(self, URL: str) -> Optional[List[str]]:

        ##
        #
        # Scans the channel: generates the list of story URLs.
        #
        # @param URL The URL of the channel.
        #
        # @return **None** when the scan fails, a list of story URLs when it doesn't fail.
        #
        ##

        if (not URL) or (GetHostname(URL) not in self.GetSupportedHostnames()):
            return None

        # A URL pointing to a specific story is not a channel.
        usernameStoryIDMatch = re.search(r"/user/([a-zA-Z0-9_]+)/(\d+)", URL)
        if usernameStoryIDMatch:
            return None

        usernameMatch = re.search(r"/user/([a-zA-Z0-9_]+)", URL)
        if not usernameMatch:
            return None
        username = usernameMatch.group(1)

        normalizedURL = f"http://www.hentai-foundry.com/stories/user/{username}/"
        pageSoup = self._webSession.GetSoup(self._GetAdultView(normalizedURL))
        if not pageSoup:
            return None

        # Read the page count of the Stories tab.

        pageCountDescriptionElement = pageSoup.select_one(".galleryHeader > .summary")
        if not pageCountDescriptionElement:
            logging.error("Page count description element not found.")
            return None
        pageCountDescription = pageCountDescriptionElement.get_text().strip()

        pageCountDescriptionMatch = re.search(
            r"Displaying (\d+)-(\d+) of (\d+) results",
            pageCountDescription
        )
        if not pageCountDescriptionMatch:
            logging.error("Failed to retrieve page count of the Stories tab.")
            return None

        storiesPerPage = int(pageCountDescriptionMatch.group(2))
        storiesInTotal = int(pageCountDescriptionMatch.group(3))
        if not storiesPerPage:
            return None

        pageCount = ceil(storiesInTotal / storiesPerPage)

        # Visit every page of the Stories tab and collect the story URLs.

        storyURLs = []
        for pageIndex in range(1, pageCount + 1):

            pageURL = self._GetAdultView(
                f"http://www.hentai-foundry.com/stories/user/{username}?page={pageIndex}"
            )
            pageSoup = self._webSession.GetSoup(pageURL)
            if not pageSoup:
                return None

            storyLinkElements = pageSoup.select(".items > .storyRow > .titlebar > a")
            for linkElement in storyLinkElements:
                if not linkElement.has_attr("href"):
                    continue
                storyURLs.append(self._baseURL + linkElement["href"])

        return storyURLs

    def _InternallyScanStory(self, URL: str, soup: Optional[BeautifulSoup]) -> bool:

        ##
        #
        # Scans the story: generates the list of chapter URLs and retrieves the
        # metadata.
        #
        # @param URL  The URL of the story.
        # @param soup The tag soup.
        #
        # @return **False** when the scan fails, **True** when it doesn't fail.
        #
        ##

        # Locate metadata.

        titleElement = soup.select_one(".titlebar a")
        if not titleElement:
            logging.error("Title element not found.")
            return False

        authorElement = soup.select_one(".storyInfo > .col1 > a")
        if not authorElement:
            logging.error("Author element not found.")
            return False

        datesElements = soup.select(".storyInfo > .col2 > .indent")
        if (not datesElements) or (len(datesElements) < 2):
            logging.error("Dates elements not found.")
            return False
        datePublishedElement = datesElements[0]
        dateUpdatedElement = datesElements[1]

        summaryElement = soup.select_one(".storyDescript")
        if not summaryElement:
            logging.error("Summary element not found.")
            return False

        chapterCountWordCountElement = soup.select_one(".storyInfo > .col3")
        if not chapterCountWordCountElement:
            logging.error("Chapter/word count elements not found.")
            return False

        # Extract and save metadata.
        self.Story.Metadata.Title = titleElement.get_text().strip()
        self.Story.Metadata.Author = authorElement.get_text().strip()

        rawDatePublished = datePublishedElement.get_text().strip()
        rawDateUpdated = dateUpdatedElement.get_text().strip()
        self.Story.Metadata.DatePublished = self._ReformatDate(rawDatePublished)
        self.Story.Metadata.DateUpdated = self._ReformatDate(rawDateUpdated)

        chapterCountWordCountDescription = StripHTML(
            chapterCountWordCountElement.get_text().strip())

        chapterCountMatch = re.search(r"Chapters:\s+(\d+)", chapterCountWordCountDescription)
        if not chapterCountMatch:
            logging.error("Chapter count not found.")
            return False

        wordCountMatch = re.search(r"Words:\s+([0-9,]+)", chapterCountWordCountDescription)
        if not wordCountMatch:
            logging.error("Word count not found.")
            return False

        self.Story.Metadata.ChapterCount = int(chapterCountMatch.group(1))
        self.Story.Metadata.WordCount = self._ReadWordCount(wordCountMatch.group(1))

        self.Story.Metadata.Summary = StripHTML(summaryElement.get_text().strip())

        # Retrieve chapter URLs.

        chapterLinkElements = soup.select(".boxbody > p > a")
        if not chapterLinkElements:
            logging.error("No chapter links found.")
            return False

        for linkElement in chapterLinkElements:
            if not linkElement.has_attr("href"):
                continue
            self._chapterURLs.append(self._baseURL + linkElement["href"])

        # Return.

        return True

    def _InternallyExtractChapter(
            self,
            URL: str,
            soup: Optional[BeautifulSoup]
    ) -> Optional[Chapter]:

        ##
        #
        # Extracts specific chapter.
        #
        # @param URL  The URL of the page containing the chapter.
        # @param soup The tag soup of the page containing the chapter.
        #
        # @return The extracted Chapter, or **None** if the extraction has failed.
        #
        ##

        # Read the title.

        chapterTitle = None
        if (titleElement := soup.select_one("#viewChapter > .boxheader")):
            chapterTitle = titleElement.get_text().strip()

        # Read the content.

        storyTextElement = soup.select_one("#viewChapter > .boxbody")
        if not storyTextElement:
            logging.error("Story text element not found.")
            return None

        return Chapter(
            title=chapterTitle,
            content=Stringify(storyTextElement.encode_contents())
        )
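# Usage sketch (illustrative only, not part of the project). It exercises the extractor
# interface shown above; the profile URL is an example and error handling is kept minimal.

def _ExampleChannelScan() -> None:
    extractor = ExtractorHentaiFoundry()
    storyURLs = extractor.ScanChannel("http://www.hentai-foundry.com/user/SomeUser/profile")
    if storyURLs is None:
        print("Channel scan failed.")
        return
    print(f"Found {len(storyURLs)} story URL(s).")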
def _ProcessURL(self, URL: str) -> Optional[Story]:

    ##
    #
    # Processes a URL, in text mode.
    #
    # @param URL The URL to be processed.
    #
    # @return The Story object if the URL has been processed successfully, **None** otherwise.
    #
    ##

    # Locate a working extractor.

    self._interface.Process("Creating the extractor...", section=True)

    extractor = CreateExtractor(URL)
    if not extractor:
        logging.error("No matching extractor found.")
        return None

    self._interface.Comment(f'Extractor created: "{type(extractor).__name__}".')

    # Authenticate the user (if supported by the extractor).

    if self._arguments.Authenticate and extractor.SupportsAuthentication():

        self._interface.Process("Logging-in...", section=True)

        authenticationResult = extractor.Authenticate(self._interface)
        if Extractor.AuthenticationResult.FAILURE == authenticationResult:
            self._interface.Error("Failed to authenticate.")
        elif Extractor.AuthenticationResult.ABANDONED == authenticationResult:
            self._interface.Comment("Proceeding without logging-in...")
        else:
            self._interface.Comment("Authenticated successfully.")

    # Scan the story.

    self._interface.Process("Scanning the story...", section=True)

    if not extractor.ScanStory():
        logging.error("Failed to scan the story.")
        return None

    self._PrintMetadata(extractor.Story)

    # Check whether the output files already exist.

    outputFilePaths = self._GetOutputPaths(self._arguments.Output, extractor.Story)

    if (not self._arguments.Force) and all(x.is_file() for x in outputFilePaths.values()):
        # Nothing left to do; there is no new Story to hand back.
        self._interface.Comment("This story has been downloaded already.", section=True)
        return None
    elif self._arguments.Force:
        for filePath in outputFilePaths.values():
            if filePath.is_file():
                filePath.unlink()

    # Extract content.

    self._interface.Process("Extracting content...", section=True)

    for index in range(1, extractor.Story.Metadata.ChapterCount + 1):

        # Generate cache identifiers.

        cacheOwnerName = extractor.Story.Metadata.URL
        cacheTitleName = f"{index}-Title"
        cacheContentName = f"{index}-Content"

        # Retrieve chapter data, either from cache or by downloading it.

        retrievedFromCache = False

        chapter = Chapter(
            title=Stringify(self._cache.RetrieveItem(cacheOwnerName, cacheTitleName)),
            content=Stringify(self._cache.RetrieveItem(cacheOwnerName, cacheContentName))
        )

        if chapter:
            retrievedFromCache = True
        else:
            chapter = extractor.ExtractChapter(index)

        if not chapter:
            if (1 != index) and (extractor.Story.Metadata.ChapterCount != index):
                logging.error("Failed to extract story content.")
                return None
            else:
                self._interface.Error(
                    "Failed to extract the last chapter - it doesn't seem to exist."
                )
                continue

        extractor.Story.Chapters.append(chapter)

        # Add the chapter to cache.

        if not retrievedFromCache:
            self._cache.AddItem(cacheOwnerName, cacheTitleName, chapter.Title)
            self._cache.AddItem(cacheOwnerName, cacheContentName, chapter.Content)

        # Notify the user, then sleep for a while.

        self._interface.ProgressBar(
            index,
            extractor.Story.Metadata.ChapterCount,
            Configuration.ProgressBarLength,
            f"# Extracted chapter {index}/{extractor.Story.Metadata.ChapterCount}",
            True
        )
        if extractor.Story.Metadata.ChapterCount == index:
            self._interface.EmptyLine()

        if not retrievedFromCache and extractor.RequiresBreaksBetweenRequests():
            sleep(Configuration.PostChapterSleepTime)

    # Locate and download images.

    if self._arguments.Images:

        self._interface.Process("Downloading images...", section=True)

        # Locate the images.
        for chapter in extractor.Story.Chapters:
            extractor.Story.Images.extend(FindImagesInCode(chapter.Content))

        storySiteURL = GetSiteURL(extractor.Story.Metadata.URL)
        for image in extractor.Story.Images:
            image.URL = MakeURLAbsolute(image.URL, storySiteURL)

        self._interface.Comment(f"Found {len(extractor.Story.Images)} image(s).")

        # Download them.

        if extractor.Story.Images:

            imageCount = len(extractor.Story.Images)
            downloadedImageCount = 0
            previousImageFailedToDownload = False

            for index, image in enumerate(extractor.Story.Images, start=1):

                # Try the cache first; download only when the cached data is unusable.

                retrievedFromCache = False

                imageData = self._cache.RetrieveItem(extractor.Story.Metadata.URL, image.URL)
                if not image.CreateFromData(imageData, Configuration.MaximumImageSideLength):
                    imageData = extractor.ExtractMedia(image.URL)
                    if imageData:
                        image.CreateFromData(imageData, Configuration.MaximumImageSideLength)
                else:
                    retrievedFromCache = True

                if image:

                    if not retrievedFromCache:
                        self._cache.AddItem(
                            extractor.Story.Metadata.URL, image.URL, image.Data)

                    self._interface.ProgressBar(
                        index,
                        imageCount,
                        Configuration.ProgressBarLength,
                        f"# Downloaded image {index}/{imageCount}",
                        True
                    )
                    if imageCount == index:
                        print()

                    downloadedImageCount += 1
                    previousImageFailedToDownload = False

                else:

                    if (index > 1) and (not previousImageFailedToDownload):
                        print()

                    errorMessage =                                                           \
                        f'Failed to download image {index}/{imageCount}: "{image.URL}".'     \
                        if not imageData else                                                \
                        f'Failed to process/re-encode image {index}/{imageCount}: "{image.URL}".'
                    self._interface.Error(errorMessage)

                    previousImageFailedToDownload = True

            self._interface.Comment(
                f"Successfully downloaded {downloadedImageCount}/{imageCount} image(s)."
            )

    # Process content.

    self._interface.Process("Processing content...", section=True)

    extractor.Story.Process()

    for index, chapter in enumerate(extractor.Story.Chapters, start=1):

        # Store original content.

        if self._arguments.Debug:
            fileName = GetSanitizedFileName(f"{index} - Original.html")
            fileSubdirectoryName = GetSanitizedFileName(extractor.Story.Metadata.Title)
            WriteTextFile(
                Configuration.DebugDirectoryPath / fileSubdirectoryName / fileName,
                chapter.Content
            )

        # The sanitizer is used twice - once before any other processing, once after every
        # other processor. The first time is required to clean up the story (remove empty tags
        # and tag trees, for example), the second to guarantee that the story is actually
        # sanitized.

        chapter.Content = SanitizerProcessor().Process(chapter.Content)
        chapter.Content = TypographyProcessor().Process(chapter.Content)
        chapter.Content = SanitizerProcessor().Process(chapter.Content)

        # Store processed content.

        if self._arguments.Debug:
            fileName = GetSanitizedFileName(f"{index} - Processed.html")
            fileSubdirectoryName = GetSanitizedFileName(extractor.Story.Metadata.Title)
            WriteTextFile(
                Configuration.DebugDirectoryPath / fileSubdirectoryName / fileName,
                chapter.Content
            )

    if not extractor.Story.Metadata.WordCount:
        extractor.Story.Metadata.WordCount = extractor.Story.CalculateWordCount()

    self._interface.Comment("Content processed.")

    # Return.

    return extractor.Story
class FormatterODT(Formatter):

    def __init__(self, embedImages: bool = True, combinedVersion: bool = False) -> None:

        ##
        #
        # The constructor.
        #
        # @param embedImages     Embed images in the output file.
        # @param combinedVersion Use the template designed for multiple stories.
        #
        ##

        super().__init__(embedImages)

        # Initialize member variables.

        templateFileName =                                   \
            "Templates/FormatterODT/Template.odt"            \
            if not combinedVersion else                      \
            "Templates/FormatterODT/Template (Combined).odt"

        self._templateFilePath = GetPackageDirectory() / templateFileName

        self._manifestDocument = ""
        self._contentDocument = ""
        self._metadataDocument = ""
        self._stylesDocument = ""

        # Load the template.

        with ZipFile(self._templateFilePath, "r") as archive:
            self._manifestDocument = Stringify(archive.read("META-INF/manifest.xml"))
            self._contentDocument = Stringify(archive.read("content.xml"))
            self._metadataDocument = Stringify(archive.read("meta.xml"))
            self._stylesDocument = Stringify(archive.read("styles.xml"))

        # Modify the styles: splice the contents of Styles.xml in right before the closing tag
        # of the styles section.

        EOF = self._stylesDocument.find("</office:styles>")
        styles = ReadTextFile(GetPackageDirectory() / "Templates/FormatterODT/Styles.xml")

        self._stylesDocument = self._stylesDocument[:EOF] + styles + self._stylesDocument[EOF:]

    def FormatAndSave(self, story: Union[Story, StoryPackage], filePath: Path) -> bool:

        ##
        #
        # Formats the story (or the story package) and saves it to the output file.
        #
        # @param story    The story/story package to be formatted and saved.
        # @param filePath The path to the output file.
        #
        # @return **True** if the output file was generated and saved without problems,
        #         **False** otherwise.
        #
        ##

        # Retrieve story content and translate it to ODT-compatible XML.

        def ChapterTitler(index: int, chapterTitle: str) -> str:
            return f"Chapter {index}" + (f": {chapterTitle}" if chapterTitle else "")

        def PackagePrefixer(index: int, chapterTitle: str, storyTitle: str) -> str:
            return f"<h1>{storyTitle} — {ChapterTitler(index, chapterTitle)}</h1>"

        def StoryPrefixer(index: int, chapterTitle: str, storyTitle: str) -> str:
            return f"<h1>{ChapterTitler(index, chapterTitle)}</h1>"

        prefixer = PackagePrefixer if isinstance(story, StoryPackage) else StoryPrefixer

        content = self._TranslateHTMLtoODT(story.Join(prefixer), story)
        content = StripEmptyTags(
            content,
            validEmptyTags=["draw:frame", "draw:image"],
            validEmptyTagAttributes={"text:style-name": "Horizontal_20_Line"}
        )

        # Prepare the files.

        manifestDocument = self._manifestDocument
        contentDocument = self._contentDocument
        metadataDocument = self._metadataDocument
        stylesDocument = self._stylesDocument

        # Modify the content.

        metadata = story.Metadata.GetPrettified(escapeHTMLEntities=True)

        contentDocument = story.FillTemplate(contentDocument, escapeHTMLEntities=True)
        contentDocument = contentDocument.replace("http://link.link/", metadata.URL)

        EOF = contentDocument.find("</office:text>")
        contentDocument = contentDocument[:EOF] + content + contentDocument[EOF:]

        # Modify the metadata.

        EOF = "</office:meta>"
        metadataDocument = self._SetTagContent(
            metadataDocument, "dc:title", html.escape(metadata.Title), EOF)
        metadataDocument = self._SetTagContent(
            metadataDocument, "meta:initial-creator", html.escape(metadata.Author), EOF)
        metadataDocument = self._SetTagContent(
            metadataDocument, "dc:creator", html.escape(metadata.Author), EOF)

        # Modify the styles.

        stylesDocument = story.FillTemplate(stylesDocument, escapeHTMLEntities=True)

        # Modify the manifest.
        if self._embedImages:
            for index, image in enumerate(story.Images):
                if not image.Data:
                    continue
                EOF = manifestDocument.find("</manifest:manifest>")
                manifestDocument = manifestDocument[:EOF] +                        \
                    '<manifest:file-entry manifest:full-path="Pictures/{}.jpeg"'   \
                    ' manifest:media-type="image/jpeg"/>'.format(index) +          \
                    manifestDocument[EOF:]

        # Save the output file.

        ReplacedFilesNames = [
            "META-INF/manifest.xml",
            "content.xml",
            "meta.xml",
            "styles.xml"
        ]

        with ZipFile(filePath, mode="a") as outputArchive:

            # Copy everything but the replaced files over from the template archive.
            with ZipFile(self._templateFilePath, "r") as archive:
                for item in [
                    x for x in archive.infolist() if x.filename not in ReplacedFilesNames
                ]:
                    outputArchive.writestr(item, archive.read(item.filename))

            outputArchive.writestr("META-INF/manifest.xml", manifestDocument)
            outputArchive.writestr("content.xml", contentDocument)
            outputArchive.writestr("meta.xml", metadataDocument)
            outputArchive.writestr("styles.xml", stylesDocument)

            if self._embedImages:
                for index, image in enumerate(story.Images):
                    if not image:
                        continue
                    outputArchive.writestr(f"Pictures/{index}.jpeg", image.Data)

        # Return.

        return True

    @staticmethod
    def _SetTagContent(document: str, tagName: str, newValue: str, EOF: str) -> str:

        ##
        #
        # Replaces XML tag contents.
        #
        # @param document XML code.
        # @param tagName  Tag name.
        # @param newValue Desired tag value.
        # @param EOF      A string designating end-of-file.
        #
        # @return Modified code.
        #
        ##

        openingTag = f"<{tagName}>"
        closingTag = f"</{tagName}>"
        tagWithValue = f"{openingTag}{newValue}{closingTag}"

        if -1 != (tagPosition := document.find(openingTag)):
            document = re.sub(f"{openingTag}(.*){closingTag}", tagWithValue, document)
        elif -1 != (EOF := document.find(EOF)):
        if not contentElement:
            logging.error("Content element not found.")
            return None

        # Remove the story header, the author's notes, and any forms from the content.

        if (element := contentElement.select_one("div#storyHeader")):
            element.decompose()

        if (element := contentElement.select_one("div#authorNotes")):
            element.decompose()

        for element in contentElement.select("form"):
            element.decompose()

        # Return.

        return Chapter(content=Stringify(contentElement.encode_contents()))

    @staticmethod
    def _GetStoryID(URL: str) -> Optional[str]:

        ##
        #
        # Retrieves story ID from story URL.
        #
        # @param URL The URL of the story.
        #
        # @return The ID of the story. Optionally **None**.
        #
        ##

        if not URL: