Beispiel #1
0
def convert(sTextileDir, sHtmlDir, cAppInfo, aPlugins, fProcessText):
    """Convert all .txt files in sTextileDir to .html files in sHtmlDir.

    Each ``<name>.txt`` is loaded with _load_textile, passed through
    _process_plugins together with aPlugins, rendered by textile2html
    (which also receives fProcessText) and written to ``<name>.html`` in
    sHtmlDir. The page title is cAppInfo.NAME followed by the file name
    with underscores replaced by spaces.

    If the tidy module is available (not None), the errors it reports for
    each generated file are printed.
    """
    # pylint: disable=too-many-locals
    # Reducing the number of variables won't help clarity
    for sTextilePath in glob.glob(os.path.join(sTextileDir, "*.txt")):
        sBasename = os.path.basename(sTextilePath)
        sFilename, _sExt = os.path.splitext(sBasename)
        sHtmlPath = os.path.join(sHtmlDir, sFilename + ".html")

        dContext = {
            'title': "%s %s" % (cAppInfo.NAME, sFilename.replace('_', ' ')),
        }

        aLines = _load_textile(sTextilePath)
        aLines = _process_plugins(aLines, aPlugins)
        sHtml = textile2html('\n'.join(aLines), dContext, fProcessText)

        # Render first and open the output last, so a failure while loading
        # or rendering does not leave a truncated .html file behind. The
        # context manager closes the handle even if the write itself fails.
        with open(sHtmlPath, "w") as fHtml:
            fHtml.write(sHtml)

        if tidy is not None:
            aErrors = tidy.parse(sHtmlPath).get_errors()
            if aErrors:
                print('tidy reports the following errors for %s' % sHtmlPath)
                print('\n'.join([x.err for x in aErrors]))
 def __tidy(self, htmlFile):
     """Run htmlFile through tidy and return the resulting markup as a str.

     The file is parsed with the output_xhtml, add_xml_decl and indent
     options enabled. Afterwards every key of self.__unicodeDict that
     occurs in the result is replaced by its mapped value.
     """
     import tidy
     tidyOpts = dict(output_xhtml=1, add_xml_decl=1, indent=1)
     tidyHtml = str(tidy.parse(htmlFile, **tidyOpts))
     # str.find() returns -1 (truthy) on a miss and 0 (falsy) for a match at
     # the very start, so it cannot serve as a membership test. replace() is
     # already a no-op when the key is absent, so no guard is needed.
     # items() works on both Python 2 and Python 3 dictionaries.
     for key, value in self.__unicodeDict.items():
         tidyHtml = tidyHtml.replace(key, value)
     return tidyHtml
 def __tidy(self, htmlFile):
     """Run htmlFile through tidy and return the resulting markup as a str.

     The file is parsed with the output_xhtml, add_xml_decl and indent
     options enabled. Afterwards every key of self.__unicodeDict that
     occurs in the result is replaced by its mapped value.
     """
     import tidy
     tidyOpts = dict(output_xhtml=1, add_xml_decl=1, indent=1)
     tidyHtml = str(tidy.parse(htmlFile, **tidyOpts))
     # str.find() returns -1 (truthy) on a miss and 0 (falsy) for a match at
     # the very start, so it cannot serve as a membership test. replace() is
     # already a no-op when the key is absent, so no guard is needed.
     # items() works on both Python 2 and Python 3 dictionaries.
     for key, value in self.__unicodeDict.items():
         tidyHtml = tidyHtml.replace(key, value)
     return tidyHtml
Beispiel #4
0
 def test_nonexisting(self):
     """Parsing a missing file gives empty output and names it in an error."""
     doc = tidy.parse(os.path.join(DATA_STORAGE, "missing.html"))
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(str(doc).strip(), "")
     first_error = doc.errors[0]
     self.assertIn("missing.html", first_error.message)
     if first_error.severity == "E":
         self.assertTrue(str(first_error).startswith("Error"))
     else:
         # Tidy 5.5.19 and newer
         self.assertEqual(first_error.severity, "D")
         self.assertTrue(str(first_error).startswith("Document"))
Beispiel #5
0
 def test_nonexisting(self):
     """Parsing a missing file gives empty output and names it in an error."""
     doc = tidy.parse(os.path.join(DATA_STORAGE, "missing.html"))
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(str(doc).strip(), "")
     first_error = doc.errors[0]
     self.assertIn("missing.html", first_error.message)
     if first_error.severity == "E":
         self.assertTrue(str(first_error).startswith("Error"))
     else:
         # Tidy 5.5.19 and newer
         self.assertEqual(first_error.severity, "D")
         self.assertTrue(str(first_error).startswith("Document"))
Beispiel #6
0
 def test_options(self):
     """Check tidy keyword options on both string and file input.

     With the XHTML options set, input1 must produce a CDATA section and
     "<Html>" must produce an XML declaration, at least one error and no
     "\\n" in the output. Parsing test_file with alt_text must yield the
     alt attribute and keep the "é" character.
     """
     xhtml_options = {
         "add_xml_decl": 1,
         "show_errors": 1,
         "newline": "CR",
         "output_xhtml": 1,
     }

     cdata_doc = tidy.parseString(self.input1, **xhtml_options)
     self.assertIn("CDATA", str(cdata_doc))

     html_doc = tidy.parseString("<Html>", **xhtml_options)
     self.assertTrue(str(html_doc).startswith("<?xml"))
     self.assertNotEqual(len(html_doc.errors), 0)
     self.assertNotIn("\n", str(html_doc))

     file_doc = tidy.parse(self.test_file, char_encoding="utf8", alt_text="foo")
     for expected in ('alt="foo"', "é"):
         self.assertIn(expected, file_doc.gettext())
Beispiel #7
0
    def test_options(self):
        """Check that options passed as keywords affect tidy's output.

        With the XHTML options set, input1 must produce a commented CDATA
        section and "<Html>" must produce an XML declaration and no "\\n".
        Parsing foo.htm with alt_text must yield the alt attribute and keep
        the UTF-8 encoded e-acute.
        """
        options = dict(add_xml_decl=1, show_errors=1, newline='CR',
                       output_xhtml=1)
        doc1 = tidy.parseString(self.input1, **options)
        # Raw string so the \W escapes reach the regex engine untouched
        # instead of relying on Python passing unknown escapes through.
        found = re.search(r'//<![[]CDATA[[]\W+1>2\W+//]]>', str(doc1),
                          re.MULTILINE)
        # assertTrue replaces the deprecated failUnless alias.
        self.assertTrue(found)
        doc2 = tidy.parseString("<Html>", **options)
        self.assertTrue(str(doc2).startswith('<?xml'))
        # FIXME - tidylib doesn't support this?
        #     self.failIf(len(doc2.errors) > 1)
        self.assertTrue('\n' not in str(doc2))
        doc3 = tidy.parse('foo.htm', char_encoding='utf8',
                          alt_text='foo')
        self.assertTrue('alt="foo"' in str(doc3))
        self.assertTrue('\xc3\xa9' in str(doc3))
Beispiel #8
0
def convert(sTextileDir, sHtmlDir):
    """Convert all .txt files in sTextileDir to .html files in sHtmlDir.

    Each ``<name>.txt`` is read, lines ending in a backslash are joined
    with the line that follows them, and the result is rendered by
    textile2html into ``<name>.html`` in sHtmlDir. The page title is
    "Sutekh " followed by the file name with underscores replaced by
    spaces.

    If the tidy module is available (not None), the errors it reports for
    each generated file are printed.
    """
    for sTextilePath in glob.glob(os.path.join(sTextileDir, "*.txt")):
        sBasename = os.path.basename(sTextilePath)
        sFilename, _sExt = os.path.splitext(sBasename)
        sHtmlPath = os.path.join(sHtmlDir, sFilename + ".html")

        dContext = {
            'title': "Sutekh " + sFilename.replace('_', ' '),
        }

        # NB: Late night fast 'n dirty hack alert
        # Annoyingly, python-textile 2.1 doesn't handle list elements split
        # over multiple lines the way python-textile 2.0 does [1], so we need
        # to manually join lines before feeding them to textile
        # We use the traditional trailing \ to indicate continuation
        # [1] Note that the 2.10 version in karmic is a misnumbered 2.0.10
        aLines = []
        aCurLine = []
        # Context manager so the input handle is closed even on errors.
        with open(sTextilePath, "rb") as fTextile:
            for sLine in fTextile:
                if sLine.endswith("\\\n"):
                    # Continuation: drop the backslash and newline, keep
                    # collecting until a line without the marker arrives.
                    aCurLine.append(sLine[:-2])
                    continue
                aCurLine.append(sLine)
                aLines.append(''.join(aCurLine))
                aCurLine = []
        if aCurLine:
            # The file ended on a continuation line; keep its text rather
            # than silently dropping it.
            aLines.append(''.join(aCurLine))

        # Render before opening the output so a failure here does not leave
        # a truncated .html file behind.
        sHtml = textile2html(''.join(aLines), dContext)
        with open(sHtmlPath, "wb") as fHtml:
            fHtml.write(sHtml)

        if tidy is not None:
            aErrors = tidy.parse(sHtmlPath).get_errors()
            if aErrors:
                # Single-argument print() parses under Python 2 and 3 alike.
                print('tidy reports the following errors for %s' % sHtmlPath)
                print('\n'.join([x.err for x in aErrors]))
Beispiel #9
0
 def test_options(self):
     """Check that options passed as keywords affect tidy's output.

     With the XHTML options set, input1 must produce a commented CDATA
     section and "<Html>" must produce an XML declaration and no "\\n".
     Parsing foo.htm with alt_text must yield the alt attribute and keep
     the UTF-8 encoded e-acute.
     """
     options = dict(add_xml_decl=1,
                    show_errors=1,
                    newline='CR',
                    output_xhtml=1)
     doc1 = tidy.parseString(self.input1, **options)
     # Raw string so the \W escapes reach the regex engine untouched
     # instead of relying on Python passing unknown escapes through.
     found = re.search(r'//<![[]CDATA[[]\W+1>2\W+//]]>', str(doc1),
                       re.MULTILINE)
     # assertTrue replaces the deprecated failUnless alias.
     self.assertTrue(found)
     doc2 = tidy.parseString("<Html>", **options)
     self.assertTrue(str(doc2).startswith('<?xml'))
     # FIXME - tidylib doesn't support this?
     #     self.failIf(len(doc2.errors) > 1)
     self.assertTrue('\n' not in str(doc2))
     doc3 = tidy.parse('foo.htm', char_encoding='utf8', alt_text='foo')
     self.assertTrue('alt="foo"' in str(doc3))
     self.assertTrue('\xc3\xa9' in str(doc3))
Beispiel #10
0
 def test_options(self):
     """Check tidy keyword options on both string and file input.

     With the XHTML options set, input1 must produce a CDATA section and
     "<Html>" must produce an XML declaration, at least one error and no
     "\\n" in the output. Parsing test_file with alt_text must yield the
     alt attribute and keep the "é" character.
     """
     xhtml_options = {
         "add_xml_decl": 1,
         "show_errors": 1,
         "newline": "CR",
         "output_xhtml": 1,
     }

     cdata_doc = tidy.parseString(self.input1, **xhtml_options)
     self.assertIn("CDATA", str(cdata_doc))

     html_doc = tidy.parseString("<Html>", **xhtml_options)
     self.assertTrue(str(html_doc).startswith("<?xml"))
     self.assertNotEqual(len(html_doc.errors), 0)
     self.assertNotIn("\n", str(html_doc))

     file_doc = tidy.parse(self.test_file, char_encoding="utf8", alt_text="foo")
     for expected in ('alt="foo"', "é"):
         self.assertIn(expected, file_doc.gettext())
Beispiel #11
0
 def getloginpage(self, session, argdict, languagenames=None, **kwargs):
     """Return the login page for the given session.

     If session.instance has a truthy ``logintemplate`` attribute, that
     file is rendered as a kid template and returned wrapped in
     widgets.PlainContents. The template is cleaned up with tidy first when
     that is possible, otherwise it is read as-is. Without a template the
     result of self.loginpageclass(session, argdict, ...) is returned.

     languagenames defaults to self.languagenames when None.
     """
     if languagenames is None:
         languagenames = self.languagenames
     template = getattr(session.instance, 'logintemplate', None)
     if not template:
         return self.loginpageclass(session, argdict, languagenames=languagenames, **kwargs)

     import kid
     title = getattr(session.instance, 'title', getattr(session.instance, "__name__", str(session.instance)))
     context = {'languagenames': languagenames,
                'extraargs': argdict,
                'title': title,
                'session': session}
     try:
         import tidy
         tidyoptions = dict(output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0)
         source = str(tidy.parse(template, **tidyoptions))
     except Exception:
         # Best effort: tidy is missing or failed, so use the raw template.
         # Catching Exception rather than everything lets KeyboardInterrupt
         # and SystemExit propagate.
         with open(template, "r") as templatefile:
             source = templatefile.read()
     serializer = kid.Template(source=source, **context)
     html = serializer.serialize(output="xhtml")
     return widgets.PlainContents(html)
Beispiel #12
0
 def test_options(self):
     """Check tidy keyword options on both string and file input.

     With the XHTML options set, input1 must produce a CDATA section and
     "<Html>" must produce an XML declaration, at least one error and no
     "\\n" in the output. Parsing test_file with alt_text must yield the
     alt attribute and the UTF-8 encoded e-acute.
     """
     xhtml_options = {
         'add_xml_decl': 1,
         'show_errors': 1,
         'newline': 'CR',
         'output_xhtml': 1,
     }

     cdata_doc = tidy.parseString(self.input1, **xhtml_options)
     self.assertIn('CDATA', str(cdata_doc))

     html_doc = tidy.parseString("<Html>", **xhtml_options)
     self.assertTrue(str(html_doc).startswith('<?xml'))
     self.assertNotEqual(len(html_doc.errors), 0)
     self.assertNotIn('\n', str(html_doc))

     file_doc = tidy.parse(self.test_file, char_encoding='utf8', alt_text='foo')
     for expected in (b'alt="foo"', b'\xc3\xa9'):
         self.assertIn(expected, str(file_doc))
    def analyze_tidy(self, filename):
        """
        DESCRIPTION:
            Parse a file with tidy and return the errors tidy reports.
            The file's encoding is detected with AnalyzerUnicode first and
            handed to tidy as its input encoding to prevent false
            positives.

        PARAMETERS:
            filename: path of the file to analyze.

        RETURN:
            The result of get_errors() on the parsed tidy document.

        Exits the process with status 1 if the unicode analyzer raises
        NotImplementedError or tidy raises TidyLibError.
        """
        # Detected encoding name -> name used for tidy's input-encoding.
        tidy_encodings = {
            "utf-8": "utf8",
            "iso-8859-1": "latin1",
            "us-ascii": "latin1",
        }

        try:
            analyzer_unicode = AnalyzerUnicode.AnalyzerUnicode(
                self.path_info, "file")
            encoding = analyzer_unicode.analyze(filename)["encoding"]
        except NotImplementedError:
            print("[FATAL] Analyzer not implemented")
            sys.exit(1)

        # set file encoding to prevent false positives
        # NOTE(review): any other encoding still passes None through to
        # tidy, as before - confirm tidy accepts that.
        options = {"input-encoding": tidy_encodings.get(encoding)}

        try:
            document = tidy.parse(filename, **options)
        except tidy.TidyLibError as err:
            # One formatted print() yields the same single line as the old
            # chained print statements and also parses under Python 3.
            print("could not read file content, check encoding: %s %s"
                  % (os.path.basename(filename), err))
            sys.exit(1)

        return document.get_errors()
    def analyze_tidy(self, filename):
        """
        DESCRIPTION:
            Parse a file with tidy and return the errors tidy reports.
            The file's encoding is detected with AnalyzerUnicode first and
            handed to tidy as its input encoding to prevent false
            positives.

        PARAMETERS:
            filename: path of the file to analyze.

        RETURN:
            The result of get_errors() on the parsed tidy document.

        Exits the process with status 1 if the unicode analyzer raises
        NotImplementedError or tidy raises TidyLibError.
        """
        # Detected encoding name -> name used for tidy's input-encoding.
        tidy_encodings = {
            "utf-8": "utf8",
            "iso-8859-1": "latin1",
            "us-ascii": "latin1",
        }

        try:
            analyzer_unicode = AnalyzerUnicode.AnalyzerUnicode(
                self.path_info, "file")
            encoding = analyzer_unicode.analyze(filename)["encoding"]
        except NotImplementedError:
            print("[FATAL] Analyzer not implemented")
            sys.exit(1)

        # set file encoding to prevent false positives
        # NOTE(review): any other encoding still passes None through to
        # tidy, as before - confirm tidy accepts that.
        options = {"input-encoding": tidy_encodings.get(encoding)}

        try:
            document = tidy.parse(filename, **options)
        except tidy.TidyLibError as err:
            # One formatted print() yields the same single line as the old
            # chained print statements and also parses under Python 3.
            print("could not read file content, check encoding: %s %s"
                  % (os.path.basename(filename), err))
            sys.exit(1)

        return document.get_errors()
Beispiel #15
0
    def process(self, file_name, table_index=19):
        """Extract one table from an HTML file as a ``{label: float}`` dict.

        The file is cleaned with tidy, table number table_index is read
        with t.HtmlReader, the first four rows are dropped and the last row
        is cut to its first two cells. Each remaining row becomes one
        entry: the first cell (with ' -\\nGlobal' removed) is the key and
        the second cell, minus its last character, is parsed as a float.
        Finally the 'Microsoft Live\\nSearch' value, if present, is added
        to 'MSN' and removed.

        Raises KeyError if the table has no 'MSN' row.
        """
        try:
            from StringIO import StringIO  # Python 2
        except ImportError:
            from io import StringIO  # Python 3

        reader = t.HtmlReader()
        # Convert explicitly: Python 2's StringIO did this implicitly for
        # non-string arguments, io.StringIO does not.
        tidied = StringIO(str(tidy.parse(file_name)))

        out = reader.read(tidied, table_index).data
        out = out[4:]
        out[-1] = out[-1][:2]
        # strip out unnecessary stuff
        # NOTE(review): the dropped last character is presumably a unit
        # sign such as '%' - confirm against the source table.
        out = dict(
            (x[0].replace(' -\nGlobal', ''), float(x[1][:-1])) for x in out
        )
        # consolidate microsoft results into one value
        # always have MSN but not always Live; pop() replaces the
        # Python-2-only has_key()/del pair.
        mslive = 'Microsoft Live\nSearch'
        out['MSN'] = out['MSN'] + out.pop(mslive, 0)
        return out
Beispiel #16
0
 def test_nonexisting(self):
     """Parsing a missing file gives empty output and an 'E' error."""
     doc = tidy.parse(os.path.join(DATA_STORAGE, 'missing.html'))
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(str(doc), '')
     first_error = doc.errors[0]
     # assertIn reports both operands on failure, unlike assertTrue(x in y).
     self.assertIn('missing.html', first_error.message)
     self.assertEqual(first_error.severity, 'E')
     self.assertTrue(str(first_error).startswith('Error'))
Beispiel #17
0
 def default_docs(self):
     """Parse input1, input2 and test_file and return them as a 3-tuple.

     The two inputs go through tidy.parseString; test_file goes through
     tidy.parse with char_encoding set to ascii.
     """
     return (
         tidy.parseString(self.input1),
         tidy.parseString(self.input2),
         tidy.parse(self.test_file, char_encoding='ascii'),
     )
Beispiel #18
0

# Root <routes> element of the output document; its attributes are passed
# as keyword arguments, in the same order as before.
outroot = ElementTree.Element(
    "routes",
    route="tram",
    network="local",
    operator=u"Zarząd Transportu Miejskiego w Warszawie",
    way="railway=tram",
    stop="railway=tram_stop",
    onlywholeway="yes",
    notightturns="yes",
)
# Tree wrapper around the root element.
outtree = ElementTree.ElementTree(outroot)

# Process every command-line argument as a directory containing TRASY.HTM.
for arg in sys.argv[1:]:
    doc = tidy.parse(arg + "/TRASY.HTM", **tidyopts)
    # Keep only the links whose first element starts with "T".
    links = [x for x in parselinks(doc) if x[0].startswith("T")]
    # Exactly one or two such links are accepted; anything else aborts the
    # whole run with exit status -5.
    if len(links) != 2 and len(links) != 1:
        sys.stderr.write(arg + "/TRASY.HTM has " + str(len(links)) +
                         " links!\n")
        sys.exit(-5)

    # Per-directory state, reset for each argument.
    stops = []
    firststops = []
    firstp = 0
    lastp = 0
    # Each link unpacks to (file, subroute); the file is parsed relative to
    # the argument directory.
    for file, subroute in links:
        stoplinks = stopsparse(arg + "/" + file)
        newstops = []
        for subfile, stop in stoplinks:
            # Put a space after every "." and split on whitespace.
            name = stop.replace(".", ". ").split()
Beispiel #19
0
 def defaultDocs(self):
     """Parse input1, input2, foo.htm and bar.htm; return them as a 4-tuple.

     The two inputs go through tidy.parseString, the two files through
     tidy.parse.
     """
     return (
         tidy.parseString(self.input1),
         tidy.parseString(self.input2),
         tidy.parse("foo.htm"),
         tidy.parse("bar.htm"),  # doesn't exist
     )
        ret.append(("", line[p:e]))
    f.close()
    return ret

# Root <routes> element of the output document; its attributes are passed
# as keyword arguments, in the same order as before.
outroot = ElementTree.Element(
    "routes",
    route="tram",
    network="local",
    operator=u"Zarząd Transportu Miejskiego w Warszawie",
    way="railway=tram",
    stop="railway=tram_stop",
    onlywholeway="yes",
    notightturns="yes",
)
# Tree wrapper around the root element.
outtree = ElementTree.ElementTree(outroot)

# Process every command-line argument as a directory containing TRASY.HTM.
for arg in sys.argv[1:]:
    doc = tidy.parse(arg + "/TRASY.HTM", **tidyopts)
    # Keep only the links whose first element starts with "T".
    links = [ x for x in parselinks(doc) if x[0].startswith("T") ]
    # Exactly one or two such links are accepted; anything else aborts the
    # whole run with exit status -5.
    if len(links) != 2 and len(links) != 1:
        sys.stderr.write(arg + "/TRASY.HTM has " +
                str(len(links)) + " links!\n")
        sys.exit(-5)

    # Per-directory state, reset for each argument.
    stops = []
    firststops = []
    firstp = 0
    lastp = 0
    # Each link unpacks to (file, subroute); the file is parsed relative to
    # the argument directory.
    for file, subroute in links:
        stoplinks = stopsparse(arg + "/" + file)
        newstops = []
        for subfile, stop in stoplinks:
            # Put a space after every "." and split on whitespace.
            name = stop.replace(".", ". ").split()
Beispiel #21
0
 def default_docs(self):
     """Parse input1, input2 and test_file and return them as a 3-tuple.

     The two inputs go through tidy.parseString; test_file goes through
     tidy.parse with char_encoding set to ascii.
     """
     return (
         tidy.parseString(self.input1),
         tidy.parseString(self.input2),
         tidy.parse(self.test_file, char_encoding="ascii"),
     )
Beispiel #22
0
 def defaultDocs(self):
     """Parse input1, input2, foo.htm and bar.htm; return them as a 4-tuple.

     The two inputs go through tidy.parseString, the two files through
     tidy.parse.
     """
     return (
         tidy.parseString(self.input1),
         tidy.parseString(self.input2),
         tidy.parse("foo.htm"),
         tidy.parse("bar.htm"),  # doesn't exist
     )