def _url_to_epub(
        self):
    """*generate the epub book from a URL*

    The URL is run through polyglot's ``htmlCleaner`` to produce a local HTML file, which is then converted to epub3 with pandoc. The intermediate HTML file is deleted once the epub has been confirmed to exist.

    **Return:**
        - ``epub`` -- the path to the generated epub file (``None`` if the cleaner returned nothing)

    **Raises:**
        - ``IOError`` -- if the epub cannot be opened after pandoc has run; the message carries pandoc's stderr
    """
    self.log.info('starting the ``_url_to_epub`` method')

    # ``quote`` LIVES IN ``shlex`` ON PYTHON 3.3+ AND IN ``pipes`` ON PYTHON 2
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    from polyglot import htmlCleaner
    cleaner = htmlCleaner(
        log=self.log,
        settings=self.settings,
        url=self.urlOrPath,
        outputDirectory=self.outputDirectory,
        title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
        style=False,  # add simpdf's styling to the HTML document
        metadata=True,  # include metadata in generated HTML (e.g. title),
        h1=False  # include title as H1 at the top of the doc
    )
    html = cleaner.clean()
    if not html:
        return None

    # OPTIONAL FOOTER/HEADER FILES - SHELL-QUOTED SO ANY CHARACTER IN THE
    # PATH IS PASSED TO PANDOC LITERALLY
    if self.footer:
        footer = quote(self._tmp_html_file(self.footer))
    else:
        footer = ""
    if self.header:
        header = quote(self._tmp_html_file(self.header))
    else:
        header = ""

    # HTML SOURCE FILE
    # NOTE: EVERY ".html" OCCURRENCE IN THE PATH IS REPLACED, NOT JUST THE
    # EXTENSION
    epub = html.replace(".html", ".epub")
    pandoc = self.settings["executables"]["pandoc"]

    # THE EXECUTABLE SETTING IS LEFT UNQUOTED SO THE SHELL PARSES IT EXACTLY
    # AS BEFORE; ONLY THE GENERATED PATHS ARE QUOTED. EMPTY HEADER/FOOTER
    # SLOTS ARE DROPPED.
    cmdParts = [pandoc, "-S", "-s", "-f", "html", "-t", "epub3",
                header, quote(html), footer, "-o", quote(epub)]
    cmd = " ".join([part for part in cmdParts if part])
    p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
    stdout, stderr = p.communicate()
    self.log.debug('output: %(stdout)s' % locals())

    # CONFIRM PANDOC ACTUALLY WROTE THE EPUB BEFORE REMOVING THE SOURCE HTML
    try:
        with open(epub):
            pass
    except IOError:
        raise IOError(
            "the epub %s does not exist on this machine, here is the failure message: %s" % (epub, stderr))

    os.remove(html)

    self.log.info('completed the ``_url_to_epub`` method')
    return epub
def _url_to_epub(self):
    """*generate the epub book from a URL*

    The URL is run through polyglot's ``htmlCleaner`` to produce a local HTML file, which is then converted to epub3 with pandoc. The intermediate HTML file is deleted once the epub has been confirmed to exist.

    **Return:**
        - ``epub`` -- the path to the generated epub file (``None`` if the cleaner returned nothing)

    **Raises:**
        - ``IOError`` -- if the epub cannot be opened after pandoc has run; the message carries pandoc's stderr
    """
    self.log.debug('starting the ``_url_to_epub`` method')

    # ``quote`` LIVES IN ``shlex`` ON PYTHON 3.3+ AND IN ``pipes`` ON PYTHON 2
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    from polyglot import htmlCleaner
    cleaner = htmlCleaner(
        log=self.log,
        settings=self.settings,
        url=self.urlOrPath,
        outputDirectory=self.outputDirectory,
        title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
        style=False,  # add simpdf's styling to the HTML document
        metadata=True,  # include metadata in generated HTML (e.g. title),
        h1=False  # include title as H1 at the top of the doc
    )
    html = cleaner.clean()
    if not html:
        return None

    # OPTIONAL FOOTER/HEADER FILES - SHELL-QUOTED SO THAT AN APOSTROPHE (OR
    # ANY OTHER CHARACTER) IN THE PATH CANNOT TERMINATE THE QUOTING
    if self.footer:
        footer = quote(self._tmp_html_file(self.footer))
    else:
        footer = ""
    if self.header:
        header = quote(self._tmp_html_file(self.header))
    else:
        header = ""

    # HTML SOURCE FILE
    # NOTE: EVERY ".html" OCCURRENCE IN THE PATH IS REPLACED, NOT JUST THE
    # EXTENSION
    epub = html.replace(".html", ".epub")
    pandoc = self.settings["executables"]["pandoc"]

    # THE EXECUTABLE SETTING IS LEFT UNQUOTED SO THE SHELL PARSES IT EXACTLY
    # AS BEFORE; ONLY THE GENERATED PATHS ARE QUOTED. EMPTY HEADER/FOOTER
    # SLOTS ARE DROPPED.
    cmdParts = [pandoc, "-S", "-s", "-f", "html", "-t", "epub3",
                header, quote(html), footer, "-o", quote(epub)]
    cmd = " ".join([part for part in cmdParts if part])
    p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
    stdout, stderr = p.communicate()
    self.log.debug('output: %(stdout)s' % locals())

    # CONFIRM PANDOC ACTUALLY WROTE THE EPUB BEFORE REMOVING THE SOURCE HTML
    try:
        with open(epub):
            pass
    except IOError:
        raise IOError(
            "the epub %s does not exist on this machine, here is the failure message: %s" % (epub, stderr))

    os.remove(html)

    self.log.debug('completed the ``_url_to_epub`` method')
    return epub
def test_htmlCleaner_function(self):
    """*clean the same blog URL twice with ``htmlCleaner`` - once with ``title=False`` and once with an explicit title*

    The test passes as long as neither ``clean()`` call raises.
    """
    from polyglot import htmlCleaner

    # KEYWORD ARGUMENTS COMMON TO BOTH RUNS
    sharedKwargs = dict(
        log=log,
        settings=settings,
        url="http://www.thespacedoctor.co.uk/blog/2016/09/26/mysqlSucker-index.html",
        outputDirectory=pathToOutputDir
    )

    # RUN 1: NO TITLE SUPPLIED
    untitledCleaner = htmlCleaner(title=False, **sharedKwargs)
    untitledCleaner.clean()

    # RUN 2: EXPLICIT TITLE
    titledCleaner = htmlCleaner(title="my_clean_doc.html", **sharedKwargs)
    titledCleaner.clean()
def test_htmlCleaner_function_exception(self):
    """*check that ``htmlCleaner`` raises when given an unknown keyword argument*

    The failure assertion lives in the ``else`` clause rather than inside the ``try`` body: ``AssertionError`` is a subclass of ``Exception``, so an ``assert False`` placed inside the ``try`` would be swallowed by the handler and the test could never fail.
    """
    from polyglot import htmlCleaner

    try:
        this = htmlCleaner(log=log, settings=settings, fakeKey="break the code")
        this.get()
    except Exception as e:
        # EXPECTED PATH - REPORT WHAT WAS RAISED
        print(str(e))
    else:
        assert False, "expected htmlCleaner(fakeKey=...) / get() to raise an exception"
def test_htmlCleaner_function(self):
    """*clean the same blog URL twice with ``htmlCleaner`` - once with ``title=False`` and once with an explicit title*

    The test passes as long as neither ``clean()`` call raises.
    """
    from polyglot import htmlCleaner

    # KEYWORD ARGUMENTS COMMON TO BOTH RUNS
    commonArgs = {
        "log": log,
        "settings": settings,
        "url": "http://www.thespacedoctor.co.uk/blog/2016/09/26/mysqlSucker-index.html",
        "outputDirectory": pathToOutputDir,
    }

    # RUN 1: NO TITLE SUPPLIED
    firstCleaner = htmlCleaner(title=False, **commonArgs)
    firstCleaner.clean()

    # RUN 2: EXPLICIT TITLE
    secondCleaner = htmlCleaner(title="my_clean_doc.html", **commonArgs)
    secondCleaner.clean()
def test_htmlCleaner_function_exception(self):
    """*check that ``htmlCleaner`` raises when given an unknown keyword argument*

    The failure assertion lives in the ``else`` clause rather than inside the ``try`` body: ``AssertionError`` is a subclass of ``Exception``, so an ``assert False`` placed inside the ``try`` would be swallowed by the handler and the test could never fail.
    """
    from polyglot import htmlCleaner

    try:
        this = htmlCleaner(
            log=log,
            settings=settings,
            fakeKey="break the code"
        )
        this.get()
    except Exception as e:
        # EXPECTED PATH - REPORT WHAT WAS RAISED
        print(str(e))
    else:
        assert False, "expected htmlCleaner(fakeKey=...) / get() to raise an exception"
def _print_original_webpage(
        self):
    """*print the original webpage*

    If no title has been set, the page is first run through ``htmlCleaner`` purely to obtain a filename to use as the title; the cleaned HTML file itself is deleted straight away. The live URL is then handed to the electron executable named in the settings to render the PDF.

    **Return:**
        - ``pdfPath`` -- the path to the generated PDF (``None`` if a title was needed and the cleaner returned nothing)
    """
    self.log.info('starting the ``_print_original_webpage`` method')

    # ``quote`` LIVES IN ``shlex`` ON PYTHON 3.3+ AND IN ``pipes`` ON PYTHON 2
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    if not self.title:
        from polyglot import htmlCleaner
        cleaner = htmlCleaner(
            log=self.log,
            settings=self.settings,
            url=self.url,
            outputDirectory=self.folderpath,
            title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
            style=True,  # add polyglot's styling to the HTML document
            # include metadata in generated HTML (e.g. title),
            metadata=True,
            h1=True  # include title as H1 at the top of the doc
        )
        htmlFile = cleaner.clean()
        # SAME GUARD AS ``_print_parsed_webpage``: WITHOUT A CLEANED FILE
        # THERE IS NO NAME TO DERIVE THE TITLE FROM
        if not htmlFile:
            return None
        basename = os.path.basename(htmlFile)
        title = basename.replace(".html", "")
        os.remove(htmlFile)
    else:
        title = self.title

    # CONVERT TO PDF WITH ELECTON PDF
    url = self.url
    pdfPath = self.folderpath + "/" + title + self.append + ".pdf"
    electron = self.settings["executables"]["electron path"]

    # THE EXECUTABLE SETTING IS LEFT UNQUOTED SO THE SHELL PARSES IT EXACTLY
    # AS BEFORE; THE URL AND OUTPUT PATH ARE SHELL-QUOTED SO `$`, BACKTICKS
    # AND QUOTE CHARACTERS IN THEM ARE PASSED LITERALLY
    cmd = " ".join([electron, "-i", quote(url), "-o",
                    quote(pdfPath), "--printBackground"])
    p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
    stdout, stderr = p.communicate()
    self.log.debug('output: %(stdout)s' % locals())
    if len(stderr):
        print(stderr)

    exists = os.path.exists(pdfPath)
    if not exists:
        print("%(pdfPath)s was not generated for some reason - please investigate" % locals())
        # NOTE(review): EXITS WITH STATUS 0 EVEN THOUGH GENERATION FAILED -
        # KEPT AS-IS FOR CALLERS; CONFIRM WHETHER A NON-ZERO STATUS IS WANTED
        sys.exit(0)

    self.log.info('completed the ``_print_original_webpage`` method')
    return pdfPath
def _print_parsed_webpage(self):
    """*print the parsed/cleaned webpage*

    The URL is run through ``htmlCleaner`` to produce a local HTML file, which is rendered to PDF by the electron executable named in the settings. The intermediate HTML file is removed afterwards, whether or not the PDF was produced.

    **Return:**
        - ``pdfPath`` -- the path to the generated PDF (``None`` if the cleaner returned nothing)
    """
    self.log.debug('starting the ``_print_parsed_webpage()`` method')

    # ``quote`` LIVES IN ``shlex`` ON PYTHON 3.3+ AND IN ``pipes`` ON PYTHON 2
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    from polyglot import htmlCleaner
    cleaner = htmlCleaner(
        log=self.log,
        settings=self.settings,
        url=self.url,
        outputDirectory=self.folderpath,
        title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
        style=True,  # add polyglot's styling to the HTML document
        metadata=True,  # include metadata in generated HTML (e.g. title),
        h1=True  # include title as H1 at the top of the doc
    )
    htmlFile = cleaner.clean()
    if not htmlFile:
        return

    # NOTE: EVERY ".html" OCCURRENCE IN THE PATH IS REPLACED, NOT JUST THE
    # EXTENSION
    pdfPath = htmlFile.replace(".html", self.append + ".pdf")

    # CONVERT TO PDF WITH ELECTON PDF
    electron = self.settings["executables"]["electron path"]

    # THE EXECUTABLE SETTING IS LEFT UNQUOTED SO THE SHELL PARSES IT EXACTLY
    # AS BEFORE; THE GENERATED PATHS ARE SHELL-QUOTED SO `$`, BACKTICKS AND
    # QUOTE CHARACTERS IN THEM ARE PASSED LITERALLY
    cmd = " ".join([electron, "-i", quote(htmlFile), "-o", quote(pdfPath)])
    p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
    stdout, stderr = p.communicate()
    if len(stderr):
        print(stderr)
    self.log.debug('output: %(stdout)s' % locals())

    # REMOVE HTML FILE
    os.remove(htmlFile)

    exists = os.path.exists(pdfPath)
    if not exists:
        print("%(pdfPath)s was not generated for some reason - please investigate" % locals())
        # NOTE(review): EXITS WITH STATUS 0 EVEN THOUGH GENERATION FAILED -
        # KEPT AS-IS FOR CALLERS; CONFIRM WHETHER A NON-ZERO STATUS IS WANTED
        sys.exit(0)

    self.log.debug('completed the ``_print_parsed_webpage()`` method')
    return pdfPath