Example #1
    def ConvertFromEPUB(self, sourceFilePath: Path,
                        outputDirectoryPath: Path) -> bool:

        ##
        #
        # Converts an EPUB file to a MOBI file. The output file may exist: it will be overwritten if
        # it does.
        #
        # @param sourceFilePath      Path to the EPUB file.
        # @param outputDirectoryPath Path to the output directory. The output file will be created
        #                            inside it; its (base)name will be the same as the name of the
        #                            source file. The directory **has** to exist beforehand; this
        #                            method does *not* create it.
        #
        # @return **True** if the conversion has been performed successfully, **False** otherwise.
        #
        ##

        if not sourceFilePath.is_file():
            return False

        elif not outputDirectoryPath.is_dir():
            return False

        exitCode = call(
            [
                "ebook-convert",
                Stringify(sourceFilePath),
                Stringify(outputDirectoryPath / (sourceFilePath.stem + ".mobi")),
            ],
            stdout=DEVNULL,
            stderr=DEVNULL,
        )

        return 0 == exitCode
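
# Usage sketch (hedged): the class owning ConvertFromEPUB is not shown in this
# example, so "converter" stands for any instance exposing it; the paths are
# illustrative, and Calibre's ebook-convert must be available on the PATH.
#
#     from pathlib import Path
#
#     if converter.ConvertFromEPUB(Path("story.epub"), Path("output")):
#         print("MOBI file created in ./output.")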
Example #2
    def __init__(self,
                 embedImages: bool = True,
                 combinedVersion: bool = False) -> None:

        ##
        #
        # The constructor.
        #
        # @param embedImages     Embed images in the output file.
        # @param combinedVersion Use the template designed for multiple stories.
        #
        ##

        super().__init__(embedImages)

        # Initialize member variables.

        templateFileName = ("Templates/FormatterODT/Template.odt"
                            if not combinedVersion else
                            "Templates/FormatterODT/Template (Combined).odt")

        self._templateFilePath = GetPackageDirectory() / templateFileName

        self._manifestDocument = ""
        self._contentDocument = ""
        self._metadataDocument = ""
        self._stylesDocument = ""

        # Load the template.

        with ZipFile(self._templateFilePath, "r") as archive:

            self._manifestDocument = Stringify(
                archive.read("META-INF/manifest.xml"))
            self._contentDocument = Stringify(archive.read("content.xml"))
            self._metadataDocument = Stringify(archive.read("meta.xml"))
            self._stylesDocument = Stringify(archive.read("styles.xml"))

        # Modify the styles.

        EOF = self._stylesDocument.find("</office:styles>")

        styles = ReadTextFile(GetPackageDirectory() /
                              "Templates/FormatterODT/Styles.xml")
        self._stylesDocument = (
            self._stylesDocument[:EOF] + styles + self._stylesDocument[EOF:])

    def _InternallyExtractChapter(
            self, URL: str,
            soup: Optional[BeautifulSoup]) -> Optional[Chapter]:

        ##
        #
        # Extracts a specific chapter.
        #
        # @param URL  The URL of the page containing the chapter.
        # @param soup The tag soup of the page containing the chapter.
        #
        # @return The extracted chapter, or **None** if the extraction fails.
        #
        ##

        contentElements = soup.select("p")

        contentElements[0].decompose()
        contentElements[1].decompose()

        contentElements[-1].decompose()
        contentElements[-2].decompose()
        contentElements[-3].decompose()

        return Chapter(title=None, content=Stringify(soup))

    def _InternallyExtractChapter(
        self,
        URL: str,
        soup: Optional[BeautifulSoup]
    ) -> Optional[Chapter]:

        ##
        #
        # Extracts a specific chapter.
        #
        # @param URL  The URL of the page containing the chapter.
        # @param soup The tag soup of the page containing the chapter.
        #
        # @return The extracted chapter, or **None** if the extraction fails.
        #
        ##

        # Locate relevant page elements.

        titleElement = soup.select_one("h2#quizSubtitle")
        if not titleElement:
            logging.error("Title element not found.")
            return None

        contentElement = soup.select_one("#rescontent")
        if not contentElement:
            logging.error("Content element not found.")
            return None

        # Return.

        return Chapter(
            titleElement.get_text().strip(),
            Stringify(contentElement.encode_contents())
        )
Example #5
    def ExtractChapter(self, index: int) -> Optional[Chapter]:

        ##
        #
        # Extracts a specific chapter.
        #
        # @param index The index of the chapter to be extracted.
        #
        # @return The extracted chapter, or **None** if the extraction fails.
        #
        ##

        if 1 == self.Story.Metadata.ChapterCount:

            titleElement = None

            contentElement = self._storySoup.select_one(
                "div#chapters div.userstuff")
            if not contentElement:
                logging.error("Content element not found.")
                return None

            if (landmarkElement := contentElement.select_one("h3#work")):
                landmarkElement.decompose()

            return Chapter(title=titleElement.get_text().strip()
                           if titleElement else None,
                           content=Stringify(contentElement.encode_contents()))
Example #6
    def _InternallyExtractChapter(
            self, URL: str,
            soup: Optional[BeautifulSoup]) -> Optional[Chapter]:

        ##
        #
        # Extracts a specific chapter.
        #
        # @param URL  The URL of the page containing the chapter.
        # @param soup The tag soup of the page containing the chapter.
        #
        # @return The extracted chapter, or **None** if the extraction fails.
        #
        ##

        # Extract the title.

        title = None

        titleElement = soup.select_one("p.highlighted-image__title > a")
        if titleElement:
            title = titleElement.get_text().strip()

        # Extract the content.

        contentElement = soup.select_one("div.storytext-container")
        if not contentElement:
            logging.error("Could find the content element.")
            return None

        # Return.

        return Chapter(title=title,
                       content=Stringify(contentElement.encode_contents()))
    def _InternallyExtractChapter(
        self,
        URL: str,
        soup: Optional[BeautifulSoup]
    ) -> Optional[Chapter]:

        ##
        #
        # Extracts a specific chapter.
        #
        # @param URL  The URL of the page containing the chapter.
        # @param soup The tag soup of the page containing the chapter.
        #
        # @return The extracted chapter, or **None** if the extraction fails.
        #
        ##

        # Locate relevant page elements.

        titleElement = soup.select_one("div#chapter-outer > div.caption > div > h4")
        # No error-checking here. Not sure if every chapter has to have a title on WW.

        contentElement = soup.select_one("div#chapter-content")
        if not contentElement:
            logging.error("Content element not found.")
            return None

        # Return.

        return Chapter(
            titleElement.get_text().strip() if titleElement else "",
            Stringify(contentElement.encode_contents())
        )

    def _InternallyExtractChapter(
            self, URL: str,
            soup: Optional[BeautifulSoup]) -> Optional[Chapter]:

        ##
        #
        # Extracts a specific chapter.
        #
        # @param URL  The URL of the page containing the chapter.
        # @param soup The tag soup of the page containing the chapter.
        #
        # @return The extracted chapter, or **None** if the extraction fails.
        #
        ##

        # Extract the content.

        contentElement = soup.select_one("div#story")
        if not contentElement:
            logging.error("Couldn't find the content element.")
            return None

        # Return.

        return Chapter(
            title=self._chapterTitles.get(URL),
            content=Stringify(contentElement.encode_contents()))
Example #9
    def ConvertFromODT(self, sourceFilePath: Path, outputDirectoryPath: Path,
                       converterFilePath: Path) -> bool:

        ##
        #
        # Converts an ODT file to a PDF file. The output file may exist: it will be overwritten if
        # it does.
        #
        # @param sourceFilePath      Path to the ODT file.
        # @param outputDirectoryPath Path to the output directory. The output file will be created
        #                            inside it; its (base)name will be the same as the name of the
        #                            source file. The directory **has** to exist beforehand; this
        #                            method does *not* create it.
        # @param converterFilePath   Path to the LibreOffice executable (soffice.exe/soffice).
        #
        # @return **True** if the conversion has been performed successfully, **False** otherwise.
        #
        ##

        if not sourceFilePath.is_file():
            return False

        elif not outputDirectoryPath.is_dir():
            return False

        elif not converterFilePath.is_file():
            return False

        exitCode = call(
            [
                Stringify(converterFilePath),
                "--headless",
                "--convert-to",
                "pdf",
                Stringify(sourceFilePath),
                "--outdir",
                Stringify(outputDirectoryPath),
            ],
            stdout=DEVNULL,
        )

        return 0 == exitCode
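
# Usage sketch (hedged): "converter" stands for an instance of the owning
# class; the soffice location is platform-dependent and purely illustrative.
#
#     from pathlib import Path
#
#     ok = converter.ConvertFromODT(Path("story.odt"), Path("output"),
#                                   Path("/usr/bin/soffice"))
#     print("PDF created." if ok else "Conversion failed.")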
Example #10
    def Post(
        self,
        URL: str,
        payload,
        text: bool = True,
        textEncoding: str = DEFAULT_TEXT_ENCODING
    ) -> Optional[Union[bytes, str]]:

        ##
        #
        # Posts some data and receives the response.
        #
        # @param URL          The URL.
        # @param payload      The data to be posted.
        # @param text         Should the response be converted to text?
        # @param textEncoding The text encoding to be used during the conversion.
        #
        # @return Retrieved response (as *bytes* or *str*), or **None**.
        #
        ##

        # Prepare the headers.

        requestHeaders = {"User-Agent": self._userAgent}

        # Send the request.

        response = self._session.post(URL,
                                      headers=requestHeaders,
                                      data=payload)
        if (not response) or (200 != response.status_code):
            return None

        # Process the response.

        data = Stringify(response.content,
                         encoding=textEncoding) if text else response.content

        # Return.

        return data
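
# Usage sketch (hedged): "session" stands for an instance of the class that
# defines Post(); the URL and form payload are illustrative only.
#
#     response = session.Post("https://example.com/login",
#                             payload={"username": "alice", "password": "..."})
#     if response is None:
#         print("The POST request failed.")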
Example #11
    def _InternallyExtractChapter(
            self, URL: str,
            soup: Optional[BeautifulSoup]) -> Optional[Chapter]:

        ##
        #
        # Extracts a specific chapter.
        #
        # @param URL  The URL of the page containing the chapter.
        # @param soup The tag soup of the page containing the chapter.
        #
        # @return The extracted chapter, or **None** if the extraction fails.
        #
        ##

        rowElements = soup.select("div#contentdata > table > tr")
        if (not rowElements) or len(rowElements) < 3:
            logging.error("Chapter page doesn't conform to expected format.")

        return Chapter(title=None,
                       content=Stringify(rowElements[2].encode_contents()))
Example #12
    def Get(self,
            URL: str,
            text: bool = True,
            textEncoding: str = DEFAULT_TEXT_ENCODING,
            stream: bool = False) -> Optional[Union[bytes, str]]:

        ##
        #
        # Retrieves data using a GET request.
        #
        # @param URL          The URL.
        # @param text         Should the response be converted to text?
        # @param textEncoding The text encoding to be used during the conversion.
        # @param stream       Whether to stream the response instead of downloading it whole.
        #
        # @return Retrieved response (as *bytes* or *str*), or **None**.
        #
        ##

        # Prepare the headers.

        requestHeaders = {"User-Agent": self._userAgent}

        # Send the request.

        response = self._session.get(URL,
                                     headers=requestHeaders,
                                     stream=stream)
        if (not response) or (200 != response.status_code):
            return None

        # Process the response.

        data = Stringify(response.content,
                         encoding=textEncoding) if text else response.content

        # Return.

        return data
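
# Usage sketch (hedged): "session" stands for an instance of the class that
# defines Get(); passing text=False returns the raw bytes, e.g. for images.
#
#     page = session.Get("https://example.com/story")
#     imageData = session.Get("https://example.com/cover.jpeg", text=False)
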
class ExtractorLiterotica(Extractor):
    def __init__(self) -> None:

        ##
        #
        # The constructor.
        #
        ##

        super().__init__()

    def GetSupportedHostnames(self) -> List[str]:

        ##
        #
        # Returns a list of hostnames supported by the extractor.
        #
        # @return A list of supported hostnames.
        #
        ##

        return ["literotica.com"]

    def ScanChannel(self, URL: str) -> Optional[List[str]]:

        ##
        #
        # Scans the channel: generates the list of story URLs.
        #
        # @param URL The URL of the channel.
        #
        # @return **None** when the scan fails, a list of story URLs when it doesn't fail.
        #
        ##

        if (not URL) or (GetHostname(URL) not in self.GetSupportedHostnames()):
            return None

        # Download author's profile page.

        userIDMatch = re.search(r"\?uid=(\d+)", URL)
        if not userIDMatch:
            return None

        userID = userIDMatch.group(1)
        userPageURL = f"{self.MEMBER_PAGE_URL}uid={userID}&page=submissions"

        soup = self._webSession.GetSoup(userPageURL)
        if not soup:
            return None

        # Locate all the stories.

        storyURLs = []

        storyHeaderElement = soup.select_one("tr.st-top")
        if not storyHeaderElement:
            return None

        storyRowElement = storyHeaderElement.next_sibling
        while storyRowElement:

            if not storyRowElement.has_attr("class"):
                break

            if "root-story" in storyRowElement["class"]:

                anchorElement = storyRowElement.select_one("a")
                if (not anchorElement) or (not anchorElement.has_attr("href")):
                    continue

                storyURLs.append(anchorElement["href"])
                storyRowElement = storyRowElement.next_sibling

            elif "ser-ttl" in storyRowElement["class"]:

                storyRowElement = storyRowElement.next_sibling
                if (not storyRowElement.has_attr("class")) or (
                        "sl" not in storyRowElement["class"]):
                    continue

                anchorElement = storyRowElement.select_one("a")
                if (not anchorElement) or (not anchorElement.has_attr("href")):
                    continue

                storyURLs.append(anchorElement["href"])
                storyRowElement = storyRowElement.next_sibling

            elif "sl" in storyRowElement["class"]:

                storyRowElement = storyRowElement.next_sibling

            else:

                break

        # Return.

        return storyURLs

    def _InternallyScanStory(self, URL: str,
                             soup: Optional[BeautifulSoup]) -> bool:

        ##
        #
        # Scans the story: generates the list of chapter URLs and retrieves the
        # metadata.
        #
        # @param URL  The URL of the story.
        # @param soup The tag soup.
        #
        # @return **False** when the scan fails, **True** when it doesn't fail.
        #
        ##

        # Extract basic metadata.

        titleElement = soup.select_one(
            "div.b-story-header > h1") or soup.select_one("h1.headline")
        if not titleElement:
            logging.error("Title element not found.")
            return False

        authorElement = soup.select_one(
            "div.b-story-header > span.b-story-user-y > a") or soup.select_one(
                "div.panel > div.y_eS > a.y_eU")
        if not authorElement:
            logging.error("Author element not found.")
            return False

        # Download the author's page.

        if not authorElement.has_attr("href"):
            logging.error("Can't find the URL of the author's page.")
            return False

        authorsPageURL = authorElement["href"]
        authorsPageSoup = self._webSession.GetSoup(authorsPageURL)
        if not authorsPageSoup:
            logging.error(f'Failed to download page: "{authorsPageURL}".')
            return False

        # Extract remaining metadata.

        storyRowElement = None

        for storyLinkElement in authorsPageSoup.select("td.fc > a"):
            if storyLinkElement.get_text().strip() == titleElement.get_text(
            ).strip():
                storyRowElement = storyLinkElement.parent.parent
                break

        if not storyRowElement:
            logging.error(
                "Failed to find the story's entry on the author's page.")
            return False

        storyMetadataElements = storyRowElement.find_all("td")
        if len(storyMetadataElements) < 4:
            logging.error("Can't extract metadata from the author's page.")
            return False

        summaryElement = storyMetadataElements[1]
        publishedElement = storyMetadataElements[3]

        # Prepare metadata.

        title = titleElement.get_text().strip()
        datePublished = self._ReformatDate(publishedElement.get_text().strip())
        dateUpdated = self._ReformatDate(publishedElement.get_text().strip())

        # Check if the story belongs to a series.

        seriesRowElement = None

        if storyRowElement.has_attr("class") and ("sl"
                                                  in storyRowElement["class"]):

            seriesRowElement = storyRowElement.find_previous_sibling(
                "tr", {"class": "ser-ttl"})

        if seriesRowElement:

            title = seriesRowElement.get_text().strip()
            chapterDates = []

            seriesChapterRowElement = seriesRowElement.next_sibling
            while seriesChapterRowElement:

                if (not seriesChapterRowElement.has_attr("class")) or (
                        "sl" not in seriesChapterRowElement["class"]):
                    break

                seriesChapterAnchorElement = seriesChapterRowElement.select_one(
                    "a")
                if (not seriesChapterAnchorElement) or (
                        not seriesChapterAnchorElement.has_attr("href")):
                    break

                seriesChapterCellElements = seriesChapterRowElement.select(
                    "td")
                if seriesChapterCellElements:
                    chapterDates.append(
                        seriesChapterCellElements[-1].get_text().strip())

                self._chapterURLs.append(seriesChapterAnchorElement["href"])
                seriesChapterRowElement = seriesChapterRowElement.next_sibling

            if chapterDates:
                datePublished = self._ReformatDate(chapterDates[0])
                dateUpdated = self._ReformatDate(chapterDates[-1])

        else:

            self._chapterURLs = [self.Story.Metadata.URL]

        # Set the metadata.

        self.Story.Metadata.Title = title
        self.Story.Metadata.Author = authorElement.get_text().strip()

        self.Story.Metadata.DatePublished = datePublished
        self.Story.Metadata.DateUpdated = dateUpdated

        self.Story.Metadata.ChapterCount = len(self._chapterURLs)
        self.Story.Metadata.WordCount = 0

        self.Story.Metadata.Summary = StripHTML(
            summaryElement.get_text()).strip()

        # Return.

        return True

    def _InternallyExtractChapter(
            self, URL: str,
            soup: Optional[BeautifulSoup]) -> Optional[Chapter]:

        ##
        #
        # Extracts a specific chapter.
        #
        # @param URL  The URL of the page containing the chapter.
        # @param soup The tag soup of the page containing the chapter.
        #
        # @return The extracted chapter, or **None** if the extraction fails.
        #
        ##

        # Find the page count of the story.

        pageCount = 1

        if (pageSelectElement := soup.find("select", {"name": "page"})):

            pageCount = len(pageSelectElement.find_all("option"))

            if pageCount < 1:

                logging.error("Failed to read the story's page count.")
                return None

        if (pageSelectElement := soup.select_one("div.panel.clearfix.l_bH")):

            if (linkElements := pageSelectElement.find_all("a")):

                pageCount = int(Stringify(linkElements[-1].encode_contents()))

        # Download every page of the chapter and concatenate the content.

        content = ""

        for pageIndex in range(1, pageCount + 1):

            pageURL = URL + f"?page={pageIndex}"

            soup = self._webSession.GetSoup(pageURL)
            if not soup:
                logging.error(f'Failed to download page: "{pageURL}".')
                return None

            contentElement = soup.select_one(
                "div.b-story-body-x > div") or soup.select_one(
                    "div.panel.article")
            if not contentElement:
                logging.error("Story content element not found.")
                return None

            content += "<br/><br/>" + Stringify(
                contentElement.encode_contents())

        # Return.

        return Chapter(title=None, content=content)

    def _GetNormalizedStoryURL(self, URL: str) -> str:

        ##
        #
        # Returns a normalized story URL, i.e. one suitable for any further processing.
        #
        # @param URL Input URL (given by the user).
        #
        # @return Normalized URL.
        #
Example #15
                    f"Only {len(chapterElements)} chapter(s) located. "
                    f"The story supposedly has {self.Story.Metadata.ChapterCount} chapter(s)."
                )
                return None

            currentChapterElement = chapterElements[index - 1]

            titleElement = currentChapterElement.select_one("h3.title")
            contentElement = currentChapterElement.select_one("div.userstuff")

            if (landmarkElement := contentElement.select_one("h3#work")):
                landmarkElement.decompose()

            return Chapter(title=titleElement.get_text().strip()
                           if titleElement else None,
                           content=Stringify(contentElement.encode_contents()))

    def _ScanWorks(self, URL: str) -> Optional[List[str]]:

        ##
        #
        # Scans a list of works: generates the list of story URLs.
        #
        # @param URL The URL.
        #
        # @return **None** when the scan fails, a list of story URLs when it doesn't fail.
        #
        ##

        # Check the arguments.
Example #16
            title = selectedChapterElement.text.strip()

        if title and (titleMatch := re.search(r"\d+\. (.*)", title)):
            title = titleMatch.group(1)

        # Read the content.

        storyTextElement = soup.find(id="storytext")
        if not storyTextElement:
            logging.error("Story text element not found.")
            return None

        # Create the Chapter and return it.

        return Chapter(title=title,
                       content=Stringify(storyTextElement.encode_contents()))

    @staticmethod
    def _GetStoryID(URL: str) -> Optional[str]:

        if not URL:
            return None

        storyIDMatch = re.search(r"/s/(\d+)/", URL)
        if not storyIDMatch:
            return None

        return storyIDMatch.group(1)
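
    # Sanity-check sketch (hedged; the owning extractor class is not named in
    # this excerpt, so "ExtractorClass" is a placeholder):
    #
    #     ExtractorClass._GetStoryID("https://example.com/s/123456/1/")
    #     # -> "123456"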

    @staticmethod
    def _ReformatDate(date: str) -> Optional[str]:
class ExtractorHentaiFoundry(Extractor):
    def __init__(self) -> None:

        ##
        #
        # The constructor.
        #
        ##

        super().__init__()

    def GetSupportedHostnames(self) -> List[str]:

        ##
        #
        # Returns a list of hostnames supported by the extractor.
        #
        # @return A list of supported hostnames.
        #
        ##

        return ["hentai-foundry.com"]

    def ScanChannel(self, URL: str) -> Optional[List[str]]:

        ##
        #
        # Scans the channel: generates the list of story URLs.
        #
        # @param URL The URL of the channel.
        #
        # @return **None** when the scan fails, a list of story URLs when it doesn't fail.
        #
        ##

        if (not URL) or (GetHostname(URL) not in self.GetSupportedHostnames()):
            return None

        # A username-and-story-ID match means the URL points to a single story
        # rather than to a channel, so it cannot be scanned here.
        usernameStoryIDMatch = re.search(r"/user/([a-zA-Z0-9_]+)/(\d+)", URL)
        if usernameStoryIDMatch:
            return None

        usernameMatch = re.search(r"/user/([a-zA-Z0-9_]+)", URL)
        if not usernameMatch:
            return None

        username = usernameMatch.group(1)
        normalizedURL = f"http://www.hentai-foundry.com/stories/user/{username}/"

        pageSoup = self._webSession.GetSoup(self._GetAdultView(normalizedURL))
        if not pageSoup:
            return None

        pageCountDescriptionElement = pageSoup.select_one(
            ".galleryHeader > .summary")
        if not pageCountDescriptionElement:
            logging.error("Page count description element not found.")
            return None

        pageCountDescription = pageCountDescriptionElement.get_text().strip()

        pageCountDescriptionMatch = re.search(
            r"Displaying (\d+)-(\d+) of (\d+) results", pageCountDescription)

        if not pageCountDescriptionMatch:
            logging.error("Failed to retrieve page count of the Stories tab.")
            return None

        storiesPerPage = int(pageCountDescriptionMatch.group(2))
        storiesInTotal = int(pageCountDescriptionMatch.group(3))

        if not storiesPerPage:
            return None

        pageCount = ceil(storiesInTotal / storiesPerPage)
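        # E.g. "Displaying 1-25 of 53 results" yields ceil(53 / 25) = 3 pages.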

        storyURLs = []
        for pageIndex in range(1, pageCount + 1):

            pageURL = self._GetAdultView(
                f"http://www.hentai-foundry.com/stories/user/{username}?page={pageIndex}"
            )

            pageSoup = self._webSession.GetSoup(pageURL)
            if not pageSoup:
                return None

            storyLinkElements = pageSoup.select(
                ".items > .storyRow > .titlebar > a")

            for linkElement in storyLinkElements:

                if not linkElement.has_attr("href"):
                    continue

                storyURLs.append(self._baseURL + linkElement["href"])

        return storyURLs

    def _InternallyScanStory(self, URL: str,
                             soup: Optional[BeautifulSoup]) -> bool:

        ##
        #
        # Scans the story: generates the list of chapter URLs and retrieves the
        # metadata.
        #
        # @param URL  The URL of the story.
        # @param soup The tag soup.
        #
        # @return **False** when the scan fails, **True** when it doesn't fail.
        #
        ##

        # Locate metadata.

        titleElement = soup.select_one(".titlebar a")
        if not titleElement:
            logging.error("Title element not found.")
            return False

        authorElement = soup.select_one(".storyInfo > .col1 > a")
        if not authorElement:
            logging.error("Author element not found.")
            return False

        datesElements = soup.select(".storyInfo > .col2 > .indent")
        if (not datesElements) or (len(datesElements) < 2):
            logging.error("Dates elements not found.")
            return False

        datePublishedElement = datesElements[0]
        dateUpdatedElement = datesElements[1]

        summaryElement = soup.select_one(".storyDescript")
        if not summaryElement:
            logging.error("Summary element not found.")
            return False

        chapterCountWordCountElement = soup.select_one(".storyInfo > .col3")
        if not chapterCountWordCountElement:
            logging.error("Chapter/word count elements not found.")
            return False

        # Extract and save metadata.

        self.Story.Metadata.Title = titleElement.get_text().strip()
        self.Story.Metadata.Author = authorElement.get_text().strip()

        rawDatePublished = datePublishedElement.get_text().strip()
        rawDateUpdated = dateUpdatedElement.get_text().strip()

        self.Story.Metadata.DatePublished = self._ReformatDate(
            rawDatePublished)
        self.Story.Metadata.DateUpdated = self._ReformatDate(rawDateUpdated)

        chapterCountWordCountDescription = StripHTML(
            chapterCountWordCountElement.get_text().strip())
        chapterCountMatch = re.search(r"Chapters:\s+(\d+)",
                                      chapterCountWordCountDescription)
        if not chapterCountMatch:
            logging.error("Chapter count not found.")
            return False

        wordCountMatch = re.search(r"Words:\s+([0-9,]+)",
                                   chapterCountWordCountDescription)
        if not wordCountMatch:
            logging.error("Word count not found.")
            return False

        self.Story.Metadata.ChapterCount = int(chapterCountMatch.group(1))
        self.Story.Metadata.WordCount = self._ReadWordCount(
            wordCountMatch.group(1))

        self.Story.Metadata.Summary = StripHTML(
            summaryElement.get_text().strip())

        # Retrieve chapter URLs.

        chapterLinkElements = soup.select(".boxbody > p > a")
        if not chapterLinkElements:
            logging.error("No chapter links found.")
            return False

        for linkElement in chapterLinkElements:

            if not linkElement.has_attr("href"):
                continue

            self._chapterURLs.append(self._baseURL + linkElement["href"])

        # Return.

        return True

    def _InternallyExtractChapter(
            self, URL: str,
            soup: Optional[BeautifulSoup]) -> Optional[Chapter]:

        ##
        #
        # Extracts a specific chapter.
        #
        # @param URL  The URL of the page containing the chapter.
        # @param soup The tag soup of the page containing the chapter.
        #
        # @return The extracted chapter, or **None** if the extraction fails.
        #
        ##

        # Read the title.

        chapterTitle = None

        if (titleElement := soup.select_one("#viewChapter > .boxheader")):

            chapterTitle = titleElement.get_text().strip()

        # Read the content.

        storyTextElement = soup.select_one("#viewChapter > .boxbody")
        if not storyTextElement:
            logging.error("Story text element not found.")
            return None

        return Chapter(title=chapterTitle,
                       content=Stringify(storyTextElement.encode_contents()))
Example #18
    def _ProcessURL(self, URL: str) -> Optional[Story]:

        ##
        #
        # Processes a URL, in text mode.
        #
        # @param URL The URL to be processed.
        #
        # @return The Story object if the URL has been processed successfully, **None** otherwise.
        #
        ##

        # Locate a working extractor.

        self._interface.Process("Creating the extractor...", section=True)

        extractor = CreateExtractor(URL)
        if not extractor:
            logging.error("No matching extractor found.")
            return None

        self._interface.Comment(
            f'Extractor created: "{type(extractor).__name__}".')

        # Authenticate the user (if supported by the extractor).

        if self._arguments.Authenticate and extractor.SupportsAuthentication():

            self._interface.Process("Logging-in...", section=True)

            authenticationResult = extractor.Authenticate(self._interface)

            if Extractor.AuthenticationResult.FAILURE == authenticationResult:
                self._interface.Error("Failed to authenticate.")
            elif Extractor.AuthenticationResult.ABANDONED == authenticationResult:
                self._interface.Comment("Proceeding without logging-in...")
            else:
                self._interface.Comment("Authenticated successfully.")

        # Scan the story.

        self._interface.Process("Scanning the story...", section=True)

        if not extractor.ScanStory():
            logging.error("Failed to scan the story.")
            return None

        self._PrintMetadata(extractor.Story)

        # Check whether the output files already exist.

        outputFilePaths = self._GetOutputPaths(self._arguments.Output,
                                               extractor.Story)

        if (not self._arguments.Force) and all(
                x.is_file() for x in outputFilePaths.values()):
            self._interface.Comment("This story has been downloaded already.",
                                    section=True)
            return extractor.Story

        elif self._arguments.Force:
            for outputFilePath in outputFilePaths.values():
                if outputFilePath.is_file():
                    outputFilePath.unlink()

        # Extract content.

        self._interface.Process("Extracting content...", section=True)

        for index in range(1, extractor.Story.Metadata.ChapterCount + 1):

            # Generate cache identifiers.

            cacheOwnerName = extractor.Story.Metadata.URL
            cacheTitleName = f"{index}-Title"
            cacheContentName = f"{index}-Content"

            # Retrieve chapter data, either from cache or by downloading it.

            retrievedFromCache = False

            chapter = Chapter(title=Stringify(
                self._cache.RetrieveItem(cacheOwnerName, cacheTitleName)),
                              content=Stringify(
                                  self._cache.RetrieveItem(
                                      cacheOwnerName, cacheContentName)))

            if chapter:

                retrievedFromCache = True

            else:

                chapter = extractor.ExtractChapter(index)

                if not chapter:

                    if (1 != index) and (extractor.Story.Metadata.ChapterCount
                                         != index):
                        logging.error("Failed to extract story content.")
                        return None

                    else:
                        self._interface.Error(
                            "Failed to extract the last chapter - it doesn't seem to exist."
                        )
                        continue

            extractor.Story.Chapters.append(chapter)

            # Add the chapter to cache.

            if not retrievedFromCache:
                self._cache.AddItem(cacheOwnerName, cacheTitleName,
                                    chapter.Title)
                self._cache.AddItem(cacheOwnerName, cacheContentName,
                                    chapter.Content)

            # Notify the user, then sleep for a while.

            self._interface.ProgressBar(
                index, extractor.Story.Metadata.ChapterCount,
                Configuration.ProgressBarLength,
                f"# Extracted chapter {index}/{extractor.Story.Metadata.ChapterCount}",
                True)

            if extractor.Story.Metadata.ChapterCount == index:
                self._interface.EmptyLine()

            if not retrievedFromCache and extractor.RequiresBreaksBetweenRequests():
                sleep(Configuration.PostChapterSleepTime)
                sleep(Configuration.PostChapterSleepTime)

        # Locate and download images.

        if self._arguments.Images:

            self._interface.Process("Downloading images...", section=True)

            # Locate the images.

            for chapter in extractor.Story.Chapters:
                extractor.Story.Images.extend(FindImagesInCode(
                    chapter.Content))

            storySiteURL = GetSiteURL(extractor.Story.Metadata.URL)
            for image in extractor.Story.Images:
                image.URL = MakeURLAbsolute(image.URL, storySiteURL)

            self._interface.Comment(
                f"Found {len(extractor.Story.Images)} image(s).")

            # Download them.

            if extractor.Story.Images:

                imageCount = len(extractor.Story.Images)
                downloadedImageCount = 0

                previousImageFailedToDownload = False

                for index, image in enumerate(extractor.Story.Images, start=1):

                    retrievedFromCache = False
                    imageData = self._cache.RetrieveItem(
                        extractor.Story.Metadata.URL, image.URL)

                    if not image.CreateFromData(
                            imageData, Configuration.MaximumImageSideLength):

                        imageData = extractor.ExtractMedia(image.URL)

                        if imageData:
                            image.CreateFromData(
                                imageData,
                                Configuration.MaximumImageSideLength)

                    else:

                        retrievedFromCache = True

                    if image:

                        if not retrievedFromCache:
                            self._cache.AddItem(extractor.Story.Metadata.URL,
                                                image.URL, image.Data)

                        self._interface.ProgressBar(
                            index, imageCount, Configuration.ProgressBarLength,
                            f"# Downloaded image {index}/{imageCount}", True)

                        if imageCount == index:
                            print()

                        downloadedImageCount += 1
                        previousImageFailedToDownload = False

                    else:

                        if (index > 1) and (not previousImageFailedToDownload):
                            print()

                        errorMessage = (
                            f'Failed to download image {index}/{imageCount}: "{image.URL}".'
                            if not imageData else
                            f'Failed to process/re-encode image {index}/{imageCount}: "{image.URL}".')

                        self._interface.Error(errorMessage)

                        previousImageFailedToDownload = True

                self._interface.Comment(
                    f"Successfully downloaded {downloadedImageCount}/{imageCount} image(s)."
                )

        # Process content.

        self._interface.Process("Processing content...", section=True)

        extractor.Story.Process()

        for index, chapter in enumerate(extractor.Story.Chapters, start=1):

            # Store original content.

            if self._arguments.Debug:

                fileName = GetSanitizedFileName(f"{index} - Original.html")
                fileSubdirectoryName = GetSanitizedFileName(
                    extractor.Story.Metadata.Title)

                WriteTextFile(
                    Configuration.DebugDirectoryPath / fileSubdirectoryName /
                    fileName, chapter.Content)

            # The sanitizer is used twice - once before any other processing, once after every other
            # processor. The first time is required to clean up the story (remove empty tags and tag
            # trees, for example), the second to guarantee that the story is actually sanitized.

            chapter.Content = SanitizerProcessor().Process(chapter.Content)
            chapter.Content = TypographyProcessor().Process(chapter.Content)
            chapter.Content = SanitizerProcessor().Process(chapter.Content)

            # Store processed content.

            if self._arguments.Debug:

                fileName = GetSanitizedFileName(f"{index} - Processed.html")
                fileSubdirectoryName = GetSanitizedFileName(
                    extractor.Story.Metadata.Title)

                WriteTextFile(
                    Configuration.DebugDirectoryPath / fileSubdirectoryName /
                    fileName, chapter.Content)

        if not extractor.Story.Metadata.WordCount:
            extractor.Story.Metadata.WordCount = extractor.Story.CalculateWordCount()

        self._interface.Comment("Content processed.")

        # Return.

        return extractor.Story
Example #19
class FormatterODT(Formatter):
    def __init__(self,
                 embedImages: bool = True,
                 combinedVersion: bool = False) -> None:

        ##
        #
        # The constructor.
        #
        # @param embedImages     Embed images in the output file.
        # @param combinedVersion Use the template designed for multiple stories.
        #
        ##

        super().__init__(embedImages)

        # Initialize member variables.

        templateFileName = ("Templates/FormatterODT/Template.odt"
                            if not combinedVersion else
                            "Templates/FormatterODT/Template (Combined).odt")

        self._templateFilePath = GetPackageDirectory() / templateFileName

        self._manifestDocument = ""
        self._contentDocument = ""
        self._metadataDocument = ""
        self._stylesDocument = ""

        # Load the template.

        with ZipFile(self._templateFilePath, "r") as archive:

            self._manifestDocument = Stringify(
                archive.read("META-INF/manifest.xml"))
            self._contentDocument = Stringify(archive.read("content.xml"))
            self._metadataDocument = Stringify(archive.read("meta.xml"))
            self._stylesDocument = Stringify(archive.read("styles.xml"))

        # Modify the styles.

        EOF = self._stylesDocument.find("</office:styles>")

        styles = ReadTextFile(GetPackageDirectory() /
                              "Templates/FormatterODT/Styles.xml")
        self._stylesDocument = (
            self._stylesDocument[:EOF] + styles + self._stylesDocument[EOF:])
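
    # Instantiation sketch (hedged): the combined template is intended for
    # bundling several stories into a single document.
    #
    #     formatter = FormatterODT(embedImages=True, combinedVersion=True)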

    def FormatAndSave(self, story: Union[Story, StoryPackage],
                      filePath: Path) -> bool:

        ##
        #
        # Formats the story (or the story package) and saves it to the output file.
        #
        # @param story    The story/story package to be formatted and saved.
        # @param filePath The path to the output file.
        #
        # @return **True** if the output file was generated and saved without problems, **False**
        #         otherwise.
        #
        ##

        # Retrieve story content and translate it to ODT-compatible XML.

        def ChapterTitler(index: int, chapterTitle: str) -> str:
            return f"Chapter {index}" + (f": {chapterTitle}"
                                         if chapterTitle else "")

        def PackagePrefixer(index: int, chapterTitle: str,
                            storyTitle: str) -> str:
            return f"<h1>{storyTitle} — {ChapterTitler(index, chapterTitle)}</h1>"

        def StoryPrefixer(index: int, chapterTitle: str,
                          storyTitle: str) -> str:
            return f"<h1>{ChapterTitler(index, chapterTitle)}</h1>"

        prefixer = PackagePrefixer if isinstance(
            story, StoryPackage) else StoryPrefixer

        content = self._TranslateHTMLtoODT(story.Join(prefixer), story)
        content = StripEmptyTags(
            content,
            validEmptyTags=["draw:frame", "draw:image"],
            validEmptyTagAttributes={"text:style-name": "Horizontal_20_Line"})

        # Prepare the files.

        manifestDocument = self._manifestDocument
        contentDocument = self._contentDocument
        metadataDocument = self._metadataDocument
        stylesDocument = self._stylesDocument

        # Modify the content.

        metadata = story.Metadata.GetPrettified(escapeHTMLEntities=True)

        contentDocument = story.FillTemplate(contentDocument,
                                             escapeHTMLEntities=True)
        contentDocument = contentDocument.replace("http://link.link/",
                                                  metadata.URL)

        EOF = contentDocument.find("</office:text>")
        contentDocument = contentDocument[:EOF] + content + contentDocument[
            EOF:]

        # Modify the metadata.

        EOF = "</office:meta>"

        metadataDocument = self._SetTagContent(metadataDocument, "dc:title",
                                               html.escape(metadata.Title),
                                               EOF)

        metadataDocument = self._SetTagContent(metadataDocument,
                                               "meta:initial-creator",
                                               html.escape(metadata.Author),
                                               EOF)

        metadataDocument = self._SetTagContent(metadataDocument, "dc:creator",
                                               html.escape(metadata.Author),
                                               EOF)

        # Modify the styles.

        stylesDocument = story.FillTemplate(stylesDocument,
                                            escapeHTMLEntities=True)

        # Modify the manifest.

        if self._embedImages:

            for index, image in enumerate(story.Images):

                if not image.Data:
                    continue

                EOF = manifestDocument.find("</manifest:manifest>")
                manifestDocument = manifestDocument[:EOF] + \
                    '<manifest:file-entry manifest:full-path="Pictures/{}.jpeg"' \
                    ' manifest:media-type="image/jpeg"/>'.format(index) + \
                    manifestDocument[EOF:]

        # Save the output file.

        ReplacedFilesNames = [
            "META-INF/manifest.xml", "content.xml", "meta.xml", "styles.xml"
        ]

        with ZipFile(filePath, mode="a") as outputArchive:

            with ZipFile(self._templateFilePath, "r") as archive:

                for item in [
                        x for x in archive.infolist()
                        if x.filename not in ReplacedFilesNames
                ]:
                    outputArchive.writestr(item, archive.read(item.filename))

            outputArchive.writestr("META-INF/manifest.xml", manifestDocument)
            outputArchive.writestr("content.xml", contentDocument)
            outputArchive.writestr("meta.xml", metadataDocument)
            outputArchive.writestr("styles.xml", stylesDocument)

            if self._embedImages:

                for index, image in enumerate(story.Images):

                    if not image:
                        continue

                    outputArchive.writestr(f"Pictures/{index}.jpeg",
                                           image.Data)

        # Return.

        return True
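
    # Usage sketch (hedged): "story" stands for a fully extracted Story or
    # StoryPackage; the output path is illustrative.
    #
    #     formatter = FormatterODT()
    #     if not formatter.FormatAndSave(story, Path("Output/Story.odt")):
    #         logging.error("Failed to generate the ODT file.")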

    @staticmethod
    def _SetTagContent(document: str, tagName: str, newValue: str,
                       EOF: str) -> str:

        ##
        #
        # Replaces XML tag contents.
        #
        # @param document XML code.
        # @param tagName Tag name.
        # @param newValue Desired tag value.
        # @param EOF A string designating end-of-file.
        #
        # @return Modified code.
        #
        ##

        openingTag = f"<{tagName}>"
        closingTag = f"</{tagName}>"
        tagWithValue = f"{openingTag}{newValue}{closingTag}"

        if -1 != (tagPosition := document.find(openingTag)):

            document = re.sub(f"{openingTag}(.*){closingTag}", tagWithValue,
                              document)

        elif -1 != (EOF := document.find(EOF)):
Example #20
        if not contentElement:
            logging.error("Content element not found.")
            return False

        if (element := contentElement.select_one("div#storyHeader")):
            element.decompose()

        if (element := contentElement.select_one("div#authorNotes")):
            element.decompose()

        for element in contentElement.select("form"):
            element.decompose()

        # Return.

        return Chapter(content=Stringify(contentElement.encode_contents()))

    @staticmethod
    def _GetStoryID(URL: str) -> Optional[str]:

        ##
        #
        # Retrieves story ID from story URL.
        #
        # @param URL The URL of the story.
        #
        # @return The ID of the story. Optionally **None**.
        #
        ##

        if not URL: