def convert(sTextileDir, sHtmlDir, cAppInfo, aPlugins, fProcessText):
    """Convert all .txt files in sTextileDir to .html files in sHtmlDir.

    Each ``<name>.txt`` is loaded with ``_load_textile``, run through
    ``_process_plugins`` with aPlugins, and rendered by ``textile2html``
    (which also receives fProcessText) into ``<name>.html``. The page
    title is ``cAppInfo.NAME`` followed by the file name with underscores
    replaced by spaces.

    If the module-level ``tidy`` is not None, every written file is checked
    with it and any reported errors are printed.
    """
    # pylint: disable=too-many-locals
    # Reducing the number of variables won't help clarity
    for sTextilePath in glob.glob(os.path.join(sTextileDir, "*.txt")):
        sBasename = os.path.basename(sTextilePath)
        sFilename, _sExt = os.path.splitext(sBasename)
        sHtmlPath = os.path.join(sHtmlDir, sFilename + ".html")

        dContext = {
            'title': "%s %s" % (cAppInfo.NAME, sFilename.replace('_', ' ')),
        }

        # Render completely before touching the output file, so a failure
        # while loading or converting neither leaks an open handle nor
        # leaves a truncated .html file behind.
        aLines = _load_textile(sTextilePath)
        aLines = _process_plugins(aLines, aPlugins)
        sHtml = textile2html('\n'.join(aLines), dContext, fProcessText)

        with open(sHtmlPath, "w") as fHtml:
            fHtml.write(sHtml)

        if tidy is not None:
            aErrors = tidy.parse(sHtmlPath).get_errors()
            if aErrors:
                print('tidy reports the following errors for %s' % sHtmlPath)
                print('\n'.join(x.err for x in aErrors))
def __tidy(self, htmlFile):
    """Run tidy over htmlFile and return the cleaned-up document as text.

    The file is parsed with the options output_xhtml, add_xml_decl and
    indent switched on. Afterwards every key of ``self.__unicodeDict``
    found in the result is replaced by its mapped value.
    """
    import tidy

    tidyOpts = dict(output_xhtml=1, add_xml_decl=1, indent=1)
    tidyHtml = str(tidy.parse(htmlFile, **tidyOpts))

    # Replace unconditionally: str.replace() is already a no-op when the
    # key is absent. The former ``if tidyHtml.find(key):`` guard was wrong,
    # because find() returns 0 (falsy) for a match at the very start and
    # -1 (truthy) when there is no match, so a leading key was skipped.
    for key, value in self.__unicodeDict.items():
        tidyHtml = tidyHtml.replace(key, value)
    return tidyHtml
def __tidy(self, htmlFile):
    """Return the tidied XHTML text of htmlFile with substitutions applied.

    htmlFile is parsed by tidy with output_xhtml, add_xml_decl and indent
    enabled. Each key of ``self.__unicodeDict`` is then replaced by its
    value in the resulting text.
    """
    import tidy

    tidyOpts = dict(output_xhtml=1, add_xml_decl=1, indent=1)
    tidyHtml = str(tidy.parse(htmlFile, **tidyOpts))

    for key, value in self.__unicodeDict.items():
        # No ``find()`` pre-check: its result is an index (-1 when absent,
        # 0 for a match at the start), so using it as a boolean skipped
        # exactly the keys located at position 0. replace() alone is
        # correct and leaves the text untouched when the key is missing.
        tidyHtml = tidyHtml.replace(key, value)
    return tidyHtml
def test_nonexisting(self):
    """Parsing a missing file gives an empty document and a diagnostic.

    The first entry of ``doc.errors`` must mention the file name. Its
    severity is either "E" (text starts with "Error") or, for Tidy 5.5.19
    and newer, "D" (text starts with "Document").
    """
    doc = tidy.parse(os.path.join(DATA_STORAGE, "missing.html"))
    # assertEqual instead of the assertEquals alias, which was removed in
    # Python 3.12.
    self.assertEqual(str(doc).strip(), "")

    first_error = doc.errors[0]
    self.assertIn("missing.html", first_error.message)
    if first_error.severity == "E":
        # The severity is already known to be "E" in this branch, so only
        # the rendered text needs checking.
        self.assertTrue(str(first_error).startswith("Error"))
    else:
        # Tidy 5.5.19 and newer
        self.assertEqual(first_error.severity, "D")
        self.assertTrue(str(first_error).startswith("Document"))
def test_options(self):
    """Check that keyword options given to tidy show up in the output."""
    xml_options = dict(
        add_xml_decl=1, show_errors=1, newline="CR", output_xhtml=1
    )

    # XHTML output of the first sample input must contain a CDATA section.
    cdata_doc = tidy.parseString(self.input1, **xml_options)
    self.assertIn("CDATA", str(cdata_doc))

    # A bare <Html> tag: XML declaration present, errors reported, and no
    # "\n" anywhere because newline="CR" was requested.
    bare_doc = tidy.parseString("<Html>", **xml_options)
    bare_text = str(bare_doc)
    self.assertTrue(bare_text.startswith("<?xml"))
    self.assertFalse(len(bare_doc.errors) == 0)
    self.assertNotIn("\n", bare_text)

    # Options are honoured when parsing from a file as well.
    file_doc = tidy.parse(self.test_file, char_encoding="utf8", alt_text="foo")
    file_text = file_doc.gettext()
    self.assertIn('alt="foo"', file_text)
    self.assertIn("é", file_text)
def test_options(self):
    """Check that keyword options given to tidy show up in the output.

    Covers the CDATA wrapping produced for XHTML output, the XML
    declaration, the newline="CR" setting, and char_encoding/alt_text when
    parsing from a file.
    """
    options = dict(add_xml_decl=1, show_errors=1, newline='CR',
                   output_xhtml=1)

    doc1 = tidy.parseString(self.input1, **options)
    # Raw string literal: "\W" is meant for the regex engine and is not a
    # valid string escape, so the non-raw form triggers a warning on
    # current Python versions. The pattern itself is unchanged.
    found = re.search(r'//<![[]CDATA[[]\W+1>2\W+//]]>', str(doc1),
                      re.MULTILINE)
    # assertTrue/assertFalse replace the failUnless alias, which was
    # removed from unittest in Python 3.12.
    self.assertTrue(found)

    doc2 = tidy.parseString("<Html>", **options)
    self.assertTrue(str(doc2).startswith('<?xml'))
    # FIXME: a check on len(doc2.errors) is disabled here - tidylib
    # doesn't support this?
    self.assertFalse('\n' in str(doc2))

    doc3 = tidy.parse('foo.htm', char_encoding='utf8', alt_text='foo')
    self.assertTrue('alt="foo"' in str(doc3))
    self.assertTrue('\xc3\xa9' in str(doc3))
def convert(sTextileDir, sHtmlDir):
    """Convert all .txt files in sTextileDir to .html files in sHtmlDir.

    Lines ending in a backslash are joined with the line that follows
    before the text is handed to ``textile2html``. The page title is
    "Sutekh " followed by the file name with underscores replaced by
    spaces.

    If the module-level ``tidy`` is not None, every written file is checked
    with it and any reported errors are printed.
    """
    for sTextilePath in glob.glob(os.path.join(sTextileDir, "*.txt")):
        sBasename = os.path.basename(sTextilePath)
        sFilename, _sExt = os.path.splitext(sBasename)
        sHtmlPath = os.path.join(sHtmlDir, sFilename + ".html")

        dContext = {
            'title': "Sutekh " + sFilename.replace('_', ' '),
        }

        # Context managers close the files even when reading or rendering
        # raises. Binary mode is kept exactly as before.
        with open(sTextilePath, "rb") as fTextile:
            aRawLines = fTextile.readlines()

        # NB: Late night fast 'n dirty hack alert
        # Annoyingly, python-textile 2.1 doesn't handle list elements split
        # over multiple lines the way python-textile 2.0 does [1], so we
        # need to manually join lines before feeding them to textile.
        # We use the traditional trailing \ to indicate continuation.
        # [1] Note that the 2.10 version in karmic is a misnumbered 2.0.10
        aLines = []
        aCurLine = []
        for sLine in aRawLines:
            if sLine.endswith("\\\n"):
                # Drop the backslash and newline; the logical line goes on.
                aCurLine.append(sLine[:-2])
            elif aCurLine:
                aCurLine.append(sLine)
                aLines.append(''.join(aCurLine))
                aCurLine = []
            else:
                aLines.append(sLine)
        if aCurLine:
            # The file ended on a continuation line; keep the text that was
            # collected instead of silently dropping it.
            aLines.append(''.join(aCurLine))

        # Render before opening the output, so a rendering failure doesn't
        # leave a truncated .html file behind.
        sHtml = textile2html(''.join(aLines), dContext)
        with open(sHtmlPath, "wb") as fHtml:
            fHtml.write(sHtml)

        if tidy is not None:
            aErrors = tidy.parse(sHtmlPath).get_errors()
            if aErrors:
                print('tidy reports the following errors for %s' % sHtmlPath)
                print('\n'.join([x.err for x in aErrors]))
def test_options(self):
    """Verify the effect of tidy options on string and file input.

    Checks the CDATA section emitted for XHTML output, the leading XML
    declaration, the absence of "\\n" with newline='CR', and that
    char_encoding/alt_text are applied when parsing 'foo.htm'.
    """
    options = dict(add_xml_decl=1, show_errors=1, newline='CR',
                   output_xhtml=1)

    doc1 = tidy.parseString(self.input1, **options)
    # The pattern is a raw string so that "\W" reaches the regex engine
    # without going through (invalid) string-escape processing.
    cdata_pattern = r'//<![[]CDATA[[]\W+1>2\W+//]]>'
    # failUnless was removed from unittest in Python 3.12; assertTrue and
    # assertFalse are the supported spellings.
    self.assertTrue(re.search(cdata_pattern, str(doc1), re.MULTILINE))

    doc2 = tidy.parseString("<Html>", **options)
    self.assertTrue(str(doc2).startswith('<?xml'))
    # FIXME: a check on len(doc2.errors) is disabled here - tidylib
    # doesn't support this?
    self.assertFalse('\n' in str(doc2))

    doc3 = tidy.parse('foo.htm', char_encoding='utf8', alt_text='foo')
    self.assertTrue('alt="foo"' in str(doc3))
    self.assertTrue('\xc3\xa9' in str(doc3))
def test_options(self):
    """Exercise tidy keyword options on string input and on a file."""
    shared = {
        "add_xml_decl": 1,
        "show_errors": 1,
        "newline": "CR",
        "output_xhtml": 1,
    }

    # First sample input: the XHTML output has to carry a CDATA section.
    self.assertIn("CDATA", str(tidy.parseString(self.input1, **shared)))

    # Minimal document: starts with the XML declaration, reports at least
    # one error and, with CR newlines, contains no "\n".
    minimal = tidy.parseString("<Html>", **shared)
    self.assertTrue(str(minimal).startswith("<?xml"))
    self.assertFalse(len(minimal.errors) == 0)
    self.assertNotIn("\n", str(minimal))

    # File input with an encoding and a default alt text.
    from_file = tidy.parse(self.test_file, char_encoding="utf8", alt_text="foo")
    rendered = from_file.gettext()
    self.assertIn('alt="foo"', rendered)
    self.assertIn("é", rendered)
def getloginpage(self, session, argdict, languagenames=None, **kwargs):
    """Return the login page for session.

    If ``session.instance`` has a truthy ``logintemplate`` attribute, that
    template is rendered with kid and returned wrapped in
    ``widgets.PlainContents``. The template is first cleaned up with tidy
    when possible; otherwise its raw contents are used. Without a template
    the page is built by ``self.loginpageclass``.

    languagenames defaults to ``self.languagenames``; extra keyword
    arguments are passed on to ``self.loginpageclass`` only.
    """
    if languagenames is None:
        languagenames = self.languagenames

    template = getattr(session.instance, 'logintemplate', None)
    if not template:
        return self.loginpageclass(session, argdict,
                                   languagenames=languagenames, **kwargs)

    import kid
    title = getattr(session.instance, 'title',
                    getattr(session.instance, "__name__",
                            str(session.instance)))
    context = {
        'languagenames': languagenames,
        'extraargs': argdict,
        'title': title,
        'session': session,
    }
    try:
        import tidy
        tidyoptions = dict(output_xhtml=1, add_xml_decl=1, indent=1,
                           tidy_mark=0)
        source = str(tidy.parse(template, **tidyoptions))
    except Exception:
        # Best effort: tidy may be missing or may choke on the template, in
        # which case the raw template is used. Unlike a bare ``except:``
        # this lets SystemExit and KeyboardInterrupt propagate.
        templatefile = open(template, "r")
        try:
            source = templatefile.read()
        finally:
            # Close explicitly instead of relying on garbage collection.
            templatefile.close()
    serializer = kid.Template(source=source, **context)
    html = serializer.serialize(output="xhtml")
    return widgets.PlainContents(html)
def test_options(self):
    """Check that keyword options given to tidy show up in the output."""
    xml_options = dict(
        add_xml_decl=1,
        show_errors=1,
        newline='CR',
        output_xhtml=1,
    )

    # XHTML output of the first sample input must contain a CDATA section.
    cdata_doc = tidy.parseString(self.input1, **xml_options)
    self.assertIn('CDATA', str(cdata_doc))

    # A bare <Html> tag: XML declaration present, errors reported, and no
    # "\n" anywhere because newline='CR' was requested.
    bare_doc = tidy.parseString("<Html>", **xml_options)
    bare_text = str(bare_doc)
    self.assertTrue(bare_text.startswith('<?xml'))
    self.assertFalse(len(bare_doc.errors) == 0)
    self.assertNotIn('\n', bare_text)

    # NOTE(review): a bytes literal is looked up in str(doc3) below. That
    # only works where str is a byte string - confirm the interpreter this
    # test is meant to run on.
    file_text = str(tidy.parse(self.test_file, char_encoding='utf8',
                               alt_text='foo'))
    self.assertIn(b'alt="foo"', file_text)
    self.assertIn(b'\xc3\xa9', file_text)
def analyze_tidy(self, filename):
    """
    DESCRIPTION:
        Detect the encoding of a file with AnalyzerUnicode and run tidy
        over it with the matching "input-encoding" option. The process
        exits with status 1 if the analyzer is not implemented or tidy
        raises TidyLibError.
    PARAMETERS:
        filename    path of the file to check
    RETURN:
        the result of get_errors() on the parsed tidy document
    """
    # Encoding names reported by the analyzer mapped to the names handed to
    # tidy; any other encoding leaves the option at None.
    tidy_names = {
        "utf-8": "utf8",
        "iso-8859-1": "latin1",
        "us-ascii": "latin1",
    }

    try:
        detector = AnalyzerUnicode.AnalyzerUnicode(self.path_info, "file")
        encoding = detector.analyze(filename)["encoding"]
    except NotImplementedError:
        print("[FATAL] Analyzer not implemented")
        sys.exit(1)

    # set file encoding to prevent false positives
    options = {"input-encoding": tidy_names.get(encoding)}

    try:
        document = tidy.parse(filename, **options)
    except tidy.TidyLibError as err:
        report = [
            "could not read file content, check encoding:",
            os.path.basename(filename),
            str(err),
        ]
        print(" ".join(report))
        sys.exit(1)

    return document.get_errors()
def analyze_tidy(self, filename):
    """
    DESCRIPTION:
        Run tidy over a file, telling it the input encoding that
        AnalyzerUnicode detected. Exits with status 1 when the analyzer is
        not implemented or when tidy raises TidyLibError.
    PARAMETERS:
        filename    path of the file to analyze
    RETURN:
        whatever get_errors() returns for the parsed tidy document
    """
    try:
        analyzer = AnalyzerUnicode.AnalyzerUnicode(self.path_info, "file")
        detected = analyzer.analyze(filename)["encoding"]
    except NotImplementedError:
        print("[FATAL] Analyzer not implemented")
        sys.exit(1)

    # set file encoding to prevent false positives
    # (encodings not listed here are passed to tidy as None)
    known_encodings = {
        "utf-8": "utf8",
        "iso-8859-1": "latin1",
        "us-ascii": "latin1",
    }
    encoding_tidy = known_encodings.get(detected)

    try:
        document = tidy.parse(filename, **{"input-encoding": encoding_tidy})
    except tidy.TidyLibError as err:
        pieces = ("could not read file content, check encoding:",
                  os.path.basename(filename),
                  str(err))
        print(" ".join(pieces))
        sys.exit(1)
    else:
        return document.get_errors()
def process(self, file_name, table_index=19):
    """Extract one table from an HTML file as a ``{name: value}`` dict.

    The file is cleaned with tidy and the table at table_index is read
    with ``t.HtmlReader``. The first four rows are dropped, the last row
    is cut to its first two cells, and each remaining row becomes one
    entry: the first cell (with ' -\\nGlobal' removed) is the key and the
    second cell, minus its final character, is converted to float.

    Any 'Microsoft Live\\nSearch' value is folded into the 'MSN' entry.
    Raises KeyError if the table has no 'MSN' row.
    """
    try:
        from StringIO import StringIO
    except ImportError:
        # The StringIO module was folded into io in Python 3.
        from io import StringIO

    reader = t.HtmlReader()
    # Convert the tidy document to text explicitly rather than relying on
    # the StringIO implementation to stringify arbitrary objects.
    tidied = StringIO(str(tidy.parse(file_name)))
    out = reader.read(tidied, table_index).data

    out = out[4:]
    out[-1] = out[-1][:2]
    # strip out unnecessary stuff
    # NOTE(review): the character cut from x[1] is presumably a trailing
    # unit such as '%' - confirm against the source table.
    out = dict(
        (x[0].replace(' -\nGlobal', ''), float(x[1][:-1])) for x in out
    )

    # consolidate microsoft results into one value
    # always have MSN but not always Live; pop() reads and removes the
    # optional entry in one step (dict.has_key no longer exists in Python 3)
    mslive = 'Microsoft Live\nSearch'
    out['MSN'] = out['MSN'] + out.pop(mslive, 0)
    return out
def test_nonexisting(self):
    """Parsing a missing file gives an empty document and an error.

    The first entry of ``doc.errors`` must mention the file name, have
    severity 'E' and render as text starting with 'Error'.
    """
    doc = tidy.parse(os.path.join(DATA_STORAGE, 'missing.html'))
    # assertEqual instead of the assertEquals alias, which was removed in
    # Python 3.12.
    self.assertEqual(str(doc), '')

    first_error = doc.errors[0]
    # assertIn reports both operands when the check fails.
    self.assertIn('missing.html', first_error.message)
    self.assertEqual(first_error.severity, 'E')
    self.assertTrue(str(first_error).startswith('Error'))
def default_docs(self):
    """Return the three documents shared by the tests.

    The tuple holds, in order: ``self.input1`` and ``self.input2`` parsed
    from strings, and ``self.test_file`` parsed from disk with
    char_encoding='ascii'.
    """
    return (
        tidy.parseString(self.input1),
        tidy.parseString(self.input2),
        tidy.parse(self.test_file, char_encoding='ascii'),
    )
outroot = ElementTree.Element( "routes", { "route": "tram", "network": "local", "operator": u"Zarząd Transportu Miejskiego w Warszawie", "way": "railway=tram", "stop": "railway=tram_stop", "onlywholeway": "yes", "notightturns": "yes" }) outtree = ElementTree.ElementTree(outroot) for arg in sys.argv[1:]: doc = tidy.parse(arg + "/TRASY.HTM", **tidyopts) links = [x for x in parselinks(doc) if x[0].startswith("T")] if len(links) != 2 and len(links) != 1: sys.stderr.write(arg + "/TRASY.HTM has " + str(len(links)) + " links!\n") sys.exit(-5) stops = [] firststops = [] firstp = 0 lastp = 0 for file, subroute in links: stoplinks = stopsparse(arg + "/" + file) newstops = [] for subfile, stop in stoplinks: name = stop.replace(".", ". ").split()
def defaultDocs(self):
    """Return the four documents shared by the tests.

    The tuple holds, in order: ``self.input1`` and ``self.input2`` parsed
    from strings, then "foo.htm" and "bar.htm" parsed from disk.
    """
    from_strings = (
        tidy.parseString(self.input1),
        tidy.parseString(self.input2),
    )
    from_files = (
        tidy.parse("foo.htm"),
        tidy.parse("bar.htm"),  # doesn't exist
    )
    return from_strings + from_files
ret.append(("", line[p:e])) f.close() return ret outroot = ElementTree.Element("routes", { "route": "tram", "network": "local", "operator": u"Zarząd Transportu Miejskiego w Warszawie", "way": "railway=tram", "stop": "railway=tram_stop", "onlywholeway": "yes", "notightturns": "yes" }) outtree = ElementTree.ElementTree(outroot) for arg in sys.argv[1:]: doc = tidy.parse(arg + "/TRASY.HTM", **tidyopts) links = [ x for x in parselinks(doc) if x[0].startswith("T") ] if len(links) != 2 and len(links) != 1: sys.stderr.write(arg + "/TRASY.HTM has " + str(len(links)) + " links!\n") sys.exit(-5) stops = [] firststops = [] firstp = 0 lastp = 0 for file, subroute in links: stoplinks = stopsparse(arg + "/" + file) newstops = [] for subfile, stop in stoplinks: name = stop.replace(".", ". ").split()
def default_docs(self):
    """Build the standard set of documents used across the tests.

    Returns a 3-tuple: the two sample strings ``self.input1`` and
    ``self.input2`` parsed in that order, followed by ``self.test_file``
    parsed with char_encoding="ascii".
    """
    string_docs = [tidy.parseString(text) for text in (self.input1, self.input2)]
    file_doc = tidy.parse(self.test_file, char_encoding="ascii")
    return (string_docs[0], string_docs[1], file_doc)