Python htmlCleanerの例、polyglot.htmlCleaner Pythonの例

コード例 #1

0

ファイルを表示

ファイル: ebook.py プロジェクト: thespacedoctor/simpdf

    def _url_to_epub(
            self):
        """*generate the epub book from a URL*

        The page at ``self.urlOrPath`` is cleaned into an HTML file with
        polyglot's ``htmlCleaner``, that file is converted to EPUB3 with
        pandoc, and the intermediate HTML file is then removed.

        **Return:**
            - ``epub`` -- the path to the generated epub, or ``None`` if the cleaner did not return an HTML file

        **Raises:**
            - ``IOError`` -- if the epub cannot be opened once pandoc has run (the message carries pandoc's stderr)
        """
        self.log.info('starting the ``_url_to_epub`` method')

        # ``shlex.quote`` is the Python 3 name; ``pipes.quote`` is the same
        # function on Python 2
        try:
            from shlex import quote
        except ImportError:
            from pipes import quote

        from polyglot import htmlCleaner
        cleaner = htmlCleaner(
            log=self.log,
            settings=self.settings,
            url=self.urlOrPath,
            outputDirectory=self.outputDirectory,
            title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
            style=False,  # add simpdf's styling to the HTML document
            metadata=True,  # include metadata in generated HTML (e.g. title),
            h1=False  # include title as H1 at the top of the doc
        )
        html = cleaner.clean()

        if not html:
            return None

        # OPTIONAL FOOTER/HEADER FILES - AN EMPTY STRING SIMPLY DROPS OUT OF
        # THE COMMAND. SHELL-QUOTED SO THAT QUOTES, ``$`` OR BACKTICKS IN A
        # PATH CANNOT BREAK (OR INJECT INTO) THE COMMAND LINE
        if self.footer:
            footer = quote(self._tmp_html_file(self.footer))
        else:
            footer = ""

        if self.header:
            header = quote(self._tmp_html_file(self.header))
        else:
            header = ""

        # HTML SOURCE FILE -> EPUB PATH. ONLY THE EXTENSION IS SWAPPED; A
        # ``.html`` ELSEWHERE IN THE PATH IS LEFT ALONE
        epub = os.path.splitext(html)[0] + ".epub"
        pandoc = self.settings["executables"]["pandoc"]

        # THE PANDOC SETTING IS LEFT UNQUOTED ON PURPOSE: IT IS A COMMAND FROM
        # THE SETTINGS FILE, STILL INTERPRETED BY THE SHELL AS BEFORE
        # NOTE(review): ``-S`` was removed in pandoc 2.0 in favour of the
        # ``+smart`` extension - confirm against the installed pandoc version
        cmd = "%s -S -s -f html -t epub3 %s %s %s -o %s" % (
            pandoc, header, quote(html), footer, quote(epub))
        p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        self.log.debug('output: %(stdout)s' % locals())

        # CONFIRM PANDOC ACTUALLY PRODUCED A READABLE EPUB
        try:
            with open(epub):
                pass
        except IOError:
            raise IOError(
                "the epub %s does not exist on this machine, here is the failure message: %s" % (epub, stderr))

        os.remove(html)

        self.log.info('completed the ``_url_to_epub`` method')
        return epub

コード例 #2

0

ファイルを表示

    def _url_to_epub(self):
        """*generate the epub book from a URL*

        ``self.urlOrPath`` is cleaned into an HTML file by polyglot's
        ``htmlCleaner``; pandoc then converts that file to EPUB3 and the
        intermediate HTML file is deleted.

        **Return:**
            - ``epub`` -- the path to the generated epub, or ``None`` if the cleaner did not return an HTML file

        **Raises:**
            - ``IOError`` -- if the epub cannot be opened once pandoc has run (the message carries pandoc's stderr)
        """
        self.log.debug('starting the ``_url_to_epub`` method')

        # Python 3 provides ``shlex.quote``; Python 2 only has the identical
        # ``pipes.quote``
        try:
            from shlex import quote
        except ImportError:
            from pipes import quote

        from polyglot import htmlCleaner
        cleaner = htmlCleaner(
            log=self.log,
            settings=self.settings,
            url=self.urlOrPath,
            outputDirectory=self.outputDirectory,
            title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
            style=False,  # add simpdf's styling to the HTML document
            metadata=True,  # include metadata in generated HTML (e.g. title),
            h1=False  # include title as H1 at the top of the doc
        )
        html = cleaner.clean()

        if not html:
            return None

        # OPTIONAL FOOTER/HEADER FILES. EACH PATH IS SHELL-QUOTED RATHER THAN
        # WRAPPED IN LITERAL QUOTE CHARACTERS, SO A QUOTE INSIDE THE PATH
        # CANNOT TERMINATE THE ARGUMENT EARLY
        footer = ""
        if self.footer:
            footer = quote(self._tmp_html_file(self.footer))

        header = ""
        if self.header:
            header = quote(self._tmp_html_file(self.header))

        # HTML SOURCE FILE -> EPUB PATH (SWAP THE EXTENSION ONLY, NOT EVERY
        # ``.html`` THAT HAPPENS TO APPEAR IN THE PATH)
        root, _ext = os.path.splitext(html)
        epub = root + ".epub"
        pandoc = self.settings["executables"]["pandoc"]

        # FILENAMES ARE DERIVED FROM PAGE TITLES, WHICH ROUTINELY CONTAIN
        # APOSTROPHES - HENCE ``quote`` INSTEAD OF HAND-WRITTEN SINGLE QUOTES.
        # THE PANDOC SETTING ITSELF STAYS UNQUOTED AND SHELL-INTERPRETED
        # NOTE(review): ``-S`` was removed in pandoc 2.0 in favour of the
        # ``+smart`` extension - confirm against the installed pandoc version
        cmd = " ".join([
            pandoc, "-S -s -f html -t epub3",
            header, quote(html), footer,
            "-o", quote(epub)
        ])
        p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        self.log.debug('output: %(stdout)s' % locals())

        # CONFIRM PANDOC ACTUALLY PRODUCED A READABLE EPUB
        try:
            with open(epub):
                pass
        except IOError:
            raise IOError(
                "the epub %s does not exist on this machine, here is the failure message: %s"
                % (epub, stderr))

        os.remove(html)

        self.log.debug('completed the ``_url_to_epub`` method')
        return epub

コード例 #3

0

ファイルを表示

ファイル: test_htmlCleaner.py プロジェクト: thespacedoctor/simpdf

    def test_htmlCleaner_function(self):
        """*clean the same blog post twice: first with ``title=False``, then with an explicit title*

        The test makes no assertions; it passes as long as neither
        ``clean()`` call raises.
        """
        from polyglot import htmlCleaner

        pageUrl = "http://www.thespacedoctor.co.uk/blog/2016/09/26/mysqlSucker-index.html"

        # SAME PAGE, SAME OUTPUT DIRECTORY - ONLY THE TITLE ARGUMENT VARIES
        for docTitle in (False, "my_clean_doc.html"):
            htmlCleaner(
                log=log,
                settings=settings,
                url=pageUrl,
                outputDirectory=pathToOutputDir,
                title=docTitle
            ).clean()

コード例 #4

0

ファイルを表示

ファイル: test_htmlCleaner.py プロジェクト: zaork/polyglot

    def test_htmlCleaner_function_exception(self):
        """*check that ``htmlCleaner`` raises when given an unknown keyword argument*

        The failure signal lives in the ``else`` clause, outside the ``try``
        body. An ``assert False`` inside the ``try`` would raise
        ``AssertionError``, which ``except Exception`` would catch, so the
        test could never fail.
        """
        from polyglot import htmlCleaner
        try:
            this = htmlCleaner(log=log,
                               settings=settings,
                               fakeKey="break the code")
            this.get()
        except Exception as e:
            # EXPECTED PATH - SHOW THE MESSAGE FOR THE TEST LOG
            print(str(e))
        else:
            raise AssertionError(
                "htmlCleaner did not raise for an unexpected keyword argument")

コード例 #5

0

ファイルを表示

ファイル: test_htmlCleaner.py プロジェクト: zaork/polyglot

    def test_htmlCleaner_function(self):
        """*run ``htmlCleaner.clean()`` on one blog post, with and without an explicit title*

        No assertions are made; the test only requires that both runs
        complete without raising.
        """
        from polyglot import htmlCleaner

        # ARGUMENTS SHARED BY BOTH RUNS
        sharedArgs = dict(
            log=log,
            settings=settings,
            url="http://www.thespacedoctor.co.uk/blog/2016/09/26/mysqlSucker-index.html",
            outputDirectory=pathToOutputDir)

        # FIRST RUN: ``title=False``
        untitled = htmlCleaner(title=False, **sharedArgs)
        untitled.clean()

        # SECOND RUN: EXPLICIT TITLE
        titled = htmlCleaner(title="my_clean_doc.html", **sharedArgs)
        titled.clean()

コード例 #6

0

ファイルを表示

ファイル: test_htmlCleaner.py プロジェクト: thespacedoctor/simpdf

    def test_htmlCleaner_function_exception(self):
        """*check that ``htmlCleaner`` raises when given an unknown keyword argument*

        Whether an exception was raised is recorded in a flag and asserted
        after the ``try`` block. Asserting inside the ``try`` would be
        pointless: the resulting ``AssertionError`` would itself be caught by
        ``except Exception`` and the test would always pass.
        """
        from polyglot import htmlCleaner

        raised = False
        try:
            this = htmlCleaner(
                log=log,
                settings=settings,
                fakeKey="break the code"
            )
            this.get()
        except Exception as e:
            # EXPECTED PATH - SHOW THE MESSAGE FOR THE TEST LOG
            raised = True
            print(str(e))

        if not raised:
            raise AssertionError(
                "htmlCleaner did not raise for an unexpected keyword argument")

コード例 #7

0

ファイルを表示

ファイル: printpdf.py プロジェクト: thespacedoctor/simpdf

    def _print_original_webpage(
            self):
        """*print the original webpage*

        When ``self.title`` is not set, the page is first run through
        polyglot's ``htmlCleaner`` purely to obtain a filename to use as the
        title; the cleaned HTML file is deleted straight away. The PDF is
        rendered from the live URL, not from the cleaned HTML.

        **Return:**
            - ``pdfPath`` -- the path to the generated PDF, or ``None`` if no title was set and the cleaner returned no HTML file

        The process exits via ``sys.exit(0)`` if the PDF is not found after
        the conversion command has run.
        """
        self.log.info('starting the ``_print_original_webpage`` method')

        # ``shlex.quote`` is the Python 3 name; ``pipes.quote`` is the same
        # function on Python 2
        try:
            from shlex import quote
        except ImportError:
            from pipes import quote

        if not self.title:
            from polyglot import htmlCleaner
            cleaner = htmlCleaner(
                log=self.log,
                settings=self.settings,
                url=self.url,
                outputDirectory=self.folderpath,
                title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
                style=True,  # add polyglot's styling to the HTML document
                # include metadata in generated HTML (e.g. title),
                metadata=True,
                h1=True  # include title as H1 at the top of the doc
            )
            htmlFile = cleaner.clean()
            # THE CLEANER CAN COME BACK EMPTY-HANDED (THE SIBLING METHODS
            # GUARD FOR THIS TOO); WITHOUT A FILE THERE IS NO TITLE TO DERIVE
            if not htmlFile:
                return None
            basename = os.path.basename(htmlFile)
            title = basename.replace(".html", "")
            os.remove(htmlFile)
        else:
            title = self.title

        # CONVERT TO PDF WITH ELECTRON PDF

        url = self.url
        pdfPath = self.folderpath + "/" + title + self.append + ".pdf"
        electron = self.settings["executables"]["electron path"]
        # THE URL AND OUTPUT PATH ARE SHELL-QUOTED SO THAT QUOTES, ``$`` OR
        # BACKTICKS IN EITHER CANNOT BREAK (OR INJECT INTO) THE COMMAND. THE
        # EXECUTABLE SETTING STAYS UNQUOTED AND SHELL-INTERPRETED AS BEFORE
        cmd = "%s -i %s -o %s --printBackground " % (
            electron, quote(url), quote(pdfPath))
        p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        self.log.debug('output: %(stdout)s' % locals())
        if stderr:
            print(stderr)

        if not os.path.exists(pdfPath):
            print("%(pdfPath)s was not generated for some reason - please investigate" % locals())
            # NOTE(review): exit status 0 reports success to the calling shell
            # even though generation failed - confirm whether callers rely on
            # this before changing it
            sys.exit(0)

        self.log.info('completed the ``_print_original_webpage`` method')
        return pdfPath

コード例 #8

0

ファイルを表示

ファイル: printpdf.py プロジェクト: krystofl/polyglot

    def _print_parsed_webpage(self):
        """*print the parsed/cleaned webpage*

        The page at ``self.url`` is cleaned into a styled HTML file with
        polyglot's ``htmlCleaner``, that file is rendered to PDF, and the
        intermediate HTML file is removed.

        **Return:**
            - ``pdfPath`` -- the path to the generated PDF, or ``None`` if the cleaner returned no HTML file

        The process exits via ``sys.exit(0)`` if the PDF is not found after
        the conversion command has run.
        """
        self.log.debug('starting the ``_print_parsed_webpage()`` method')

        # Python 3 provides ``shlex.quote``; Python 2 only has the identical
        # ``pipes.quote``
        try:
            from shlex import quote
        except ImportError:
            from pipes import quote

        from polyglot import htmlCleaner
        cleaner = htmlCleaner(
            log=self.log,
            settings=self.settings,
            url=self.url,
            outputDirectory=self.folderpath,
            title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
            style=True,  # add polyglot's styling to the HTML document
            metadata=True,  # include metadata in generated HTML (e.g. title),
            h1=True  # include title as H1 at the top of the doc
        )
        htmlFile = cleaner.clean()
        if not htmlFile:
            return

        # SWAP ONLY THE EXTENSION; A ``.html`` ELSEWHERE IN THE PATH (E.G. IN
        # A DIRECTORY NAME) MUST NOT BE REWRITTEN
        pdfPath = os.path.splitext(htmlFile)[0] + self.append + ".pdf"

        # CONVERT TO PDF WITH ELECTRON PDF
        electron = self.settings["executables"]["electron path"]
        # BOTH PATHS ARE SHELL-QUOTED SO THAT QUOTES, ``$`` OR BACKTICKS IN A
        # TITLE-DERIVED FILENAME CANNOT BREAK (OR INJECT INTO) THE COMMAND.
        # THE EXECUTABLE SETTING STAYS UNQUOTED AND SHELL-INTERPRETED
        cmd = "%s -i %s -o %s " % (electron, quote(htmlFile), quote(pdfPath))
        p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        if stderr:
            print(stderr)
        self.log.debug('output: %(stdout)s' % locals())

        # REMOVE HTML FILE
        os.remove(htmlFile)

        if not os.path.exists(pdfPath):
            print("%(pdfPath)s was not generated for some reason - please investigate" % locals())
            # NOTE(review): exit status 0 reports success to the calling shell
            # even though generation failed - confirm whether callers rely on
            # this before changing it
            sys.exit(0)

        self.log.debug('completed the ``_print_parsed_webpage()`` method')
        return pdfPath