Beispiel #1
def getUnicodeHTMLForFile(filename):
    """Read ``filename`` in binary mode and return its contents as unicode.

    The encoding name is whatever ``GetEncoding`` reports for the raw data;
    when it reports nothing, an empty encoding name is handed to
    ``utils.makeUnicode`` instead.

    - filename: path of the HTML file to read

    Return values:
    Returns the result of ``utils.makeUnicode`` for the file contents.
    """
    # Close the handle deterministically rather than leaving it to the
    # garbage collector, which the original one-liner did.
    source = open(filename, "rb")
    try:
        html = source.read()
    finally:
        source.close()

    encoding = GetEncoding(html) or ""
    return utils.makeUnicode(html, encoding)
Beispiel #2
 def normalize(name):
     """
     Normalize a page name's case, spaces, and such, but do not
     encode any special characters--this must be a repeatable transform,
     having no effect on a name already normalized.  This is done to
     names that should already be mangled for use as URLs so that they
     will map to the same database key and such.  The mangle function
     below should be used on human-readable names.
     """
     # Trim both ends, fold to lower case, then turn inner spaces into
     # underscores before handing the result to removeEscapes.
     text = makeUnicode(name).strip().lower()
     return removeEscapes(text.replace(u" ", u"_"))
Beispiel #3
 def __setitem__(self, key, val):
     # Store an attribute value, with dedicated handling for the "class"
     # and "style" keys.
     # NOTE(review): this excerpt has clearly lost lines -- the bodies of the
     # two inner `if` statements, the right-hand side of the self.styles
     # assignment and the target of the final `else` assignment are missing,
     # so the block does not parse as shown.  Restore it from the original
     # module before relying on it.
     if "class" == key:
         # A "class" value is split on single spaces into self.classes.
         self.classes = val.split(u" ")
     elif "style" == key:
         # A "style" value is split on ";" into individual declarations.
         styles = val.split(u";")
         for s in styles:
             # Presumably empty declarations are skipped here -- the body of
             # this test is missing; TODO confirm.
             if not s:
             # Split each declaration once on ":"; the whitespace-stripped
             # left part is used as the key into self.styles.
             spl = s.split(":", 1)
             # A single-element result means the declaration had no ":";
             # the handling for that case is missing from this excerpt.
             if 1 == len(spl):
             self.styles[spl[0].lstrip().rstrip()] = \
     else:[key] = makeUnicode(val)
Beispiel #4
def getPrefix(line):
    """
    If line begins with a colon-separated namespace prefix, return it
    and remove it for further processing.

    The result is a ``(prefix, rest)`` pair.  A prefix must start with an
    alphabetic character and may continue with alphanumerics, "-" or "_";
    anything else before the first colon means there is no prefix and the
    (left-stripped) line is returned unchanged as ``rest``.  A single
    leading colon is dropped.

    >>> from namespaces import getPrefix
    >>> getPrefix("")
    (u'', u'')
    >>> getPrefix(":")
    (u'', u'')
    >>> getPrefix(":abc")
    (u'', u'abc')
    >>> getPrefix("::abc")
    (u'', u':abc')
    >>> getPrefix("abc")
    (u'', u'abc')
    >>> getPrefix("abc:")
    (u'abc', u'')
    >>> getPrefix("abc:def")
    (u'abc', u'def')
    >>> getPrefix("abc:def:ghi")
    (u'abc', u'def:ghi')
    >>> getPrefix("abc::")
    (u'abc', u':')
    """
    line = makeUnicode(line.lstrip())

    if not line:
        return u"", line
    if u":" == line[0]:
        return u"", line[1:]
    if not line[0].isalpha():
        return u"", line

    # Scan forward from the second character: a colon ends the prefix, a
    # valid prefix character keeps the scan going, and anything else means
    # the line has no namespace prefix at all.
    for i in range(1, len(line)):
        ch = line[i]
        if u":" == ch:
            return line[:i], line[i + 1:]
        if not (ch.isalnum() or ch in u"-_"):
            return u"", line

    # Ran off the end without meeting a colon: no prefix.
    return u"", line
Beispiel #5
    def demangle(name):
        """
        Given a name mangled as above, render it in a more readable form.

        Underscores become spaces and every ``$XX`` sequence (two hex
        digits) becomes the character with that code.  The result is passed
        through ``removeEscapes`` and then capitalized.  A ``$`` followed by
        two characters that are not hex digits raises ValueError.

        >>> from utils import makeUnicode, removeEscapes
        >>> import namespaces
        >>> namespaces.Local.demangle("a_page_title")
        u'A page title'
        >>> namespaces.Local.demangle("2$3a_a_10$25_$245_b_c")
        u'2: a 10% $5 b c'
        """
        v = list(makeUnicode(name))
        # Bound the scan by the decoded character list rather than by the
        # raw argument, so indices stay valid even if makeUnicode returns a
        # string of a different length than ``name``.
        count = len(v)
        for i in xrange(count):
            if v[i] == u"_":
                v[i] = u" "
            elif v[i] == u"$" and i < count - 2:
                v[i] = unichr(16 * int(v[i+1], 16) + int(v[i+2], 16))
                # Blank out the two consumed hex digits with NUL
                # placeholders; presumably removeEscapes strips them below.
                v[i+1] = u"\u0000"
                v[i+2] = u"\u0000"
        s = u"".join(v)
        s = removeEscapes(s)
        return s.capitalize()
Beispiel #6
def copyDependentFilesAndUpdateLinks(oldfile, filename):
    # Copy the files an HTML page links to into a "<name>_files" directory
    # under the project's File folder and rewrite the links in the page.
    #
    # - oldfile: original location of the page; its directory is used to
    #   resolve the links
    # - filename: the page that is read here and reopened for writing at the
    #   end
    #
    # NOTE(review): this excerpt is incomplete -- several statements are
    # missing (see the notes below) and the function is cut off right after
    # the output file is opened, so it does not parse as shown.
    myanalyzer = analyzer.ContentAnalyzer()
    htmldir = os.path.dirname(oldfile)
    html = utils.openFile(filename, "r").read()
    encoding = GetEncoding(html)
    # Fall back to the current encoding when the page declares none.
    if encoding == None:
        encoding = utils.getCurrentEncoding()
    html = utils.makeUnicode(html, encoding)
    # NOTE(review): `text` is not defined anywhere in the visible function;
    # this fallback would raise NameError if it were ever reached.
    if not encoding:
        encoding = utils.guessEncodingForText(text)
    if encoding and encoding.lower() in ["windows-1252", "iso-8859-1", "iso-8859-2"]:
        html = convNotSoSmartQuotesToHtmlEntity(html)
    # NOTE(review): nothing visible here feeds `html` to the analyzer before
    # its fileLinks are read -- presumably a call was lost; TODO confirm.
    for link in myanalyzer.fileLinks:
        sourcefile = GetFullPathForURL(link, htmldir)
        if os.path.exists(sourcefile):
            sourcedir = os.path.dirname(sourcefile)
            htmlname = os.path.basename(filename)
            depName = os.path.basename(link)
            # The link written into the page is relative ("../File/...");
            # the directory on disk is the same path with the leading "../"
            # removed, placed under settings.ProjectDir.
            destLink = u"../File/" + htmlname + "_files/" + depName
            destdir = os.path.join(settings.ProjectDir, os.path.dirname(destLink[3:].replace("/", os.sep)))
            # NOTE(review): the body of this test is missing -- presumably it
            # created destdir; TODO confirm.
            if not os.path.exists(destdir):
            result = fileutils.CopyFile(depName, sourcedir, destdir)
            if result:
                html = html.replace(link, urllib.quote(destLink))
                # NOTE(review): the two messages below look like they belonged
                # to `else:` branches (copy failed / source file missing) that
                # are missing from this excerpt.
                print "unable to copy file: " + sourcefile
            print "cannot find source file: " + sourcefile
    # The page is reopened for writing; the write itself is not part of the
    # visible excerpt.
    output = utils.openFile(filename, "w")
Beispiel #7
 def __setattr__(self, name, value):
     """Store attributes as unicode, decoding them with ``self.encoding``.

     The ``encoding`` attribute itself is stored unchanged; every other
     value goes through ``utils.makeUnicode`` first.
     """
     # make sure internally we're always using Unicode
     if name == "encoding":
         # Stored verbatim.  Without this branch the encoding was never
         # recorded, and every other attribute was overwritten with its
         # raw value right after being converted.
         self.__dict__[name] = value
     else:
         self.__dict__[name] = utils.makeUnicode(value, self.encoding)
Beispiel #8
 def _setvalue(self, text):
     """Store ``text`` in ``self._value`` after passing it through makeUnicode."""
     converted = makeUnicode(text)
     self._value = converted
Beispiel #9
 def __init__(self, parent=None, val=u""):
     """Initialise the node with ``parent`` and a unicode copy of ``val``.

     Raises NotImplementedError when CharacterData itself, rather than a
     subclass, is being instantiated.
     """
     created_directly = self.__class__ is CharacterData
     if created_directly:
         raise NotImplementedError

     Node.__init__(self, parent)
     converted = makeUnicode(val)
     self._value = converted
Beispiel #10
 def imageURL(self, title):
     """Return the image URL for ``title``.

     The mangled title is substituted into ``config.localImagePattern``.
     """
     pattern = makeUnicode(config.localImagePattern)
     mangled = Local.mangle(title)
     return pattern % mangled
Beispiel #11
 def linkURL(self, title):
     """Return the link URL for ``title``.

     The mangled title is substituted into ``config.localLinkPattern``.
     """
     pattern = makeUnicode(config.localLinkPattern)
     mangled = Local.mangle(title)
     return pattern % mangled
Beispiel #12
def GetBody(myhtml):
    """
    Function: GetBody(myhtml)
    Last Updated: 9/24/02
    Description: Get the data in between the <BODY></BODY> tags, with any
    <script> elements found in the page's <head> prepended to it.

    Arguments:
    - myhtml: a file-like object holding the HTML page; it is read with
      readlines()

    Return values:
    Returns the data between the <BODY></BODY> tags of the HTML page,
    converted with utils.makeUnicode.
    """
    inbody = False
    inscript = False
    encoding = None
    pieces = []

    htmltext = myhtml.readlines()
    for html in htmltext:
        lowered = html.lower()

        if not encoding and "<meta" in lowered:
            encoding = GetEncoding(html)

        # Look for a script tag so that we can test whether a body tag on
        # this line sits inside a script.  Matching on the lowered line also
        # catches mixed-case tags such as "<Script".
        scriptstart = lowered.find("<script")
        if "</script>" in lowered:
            inscript = False

        # Check for the start of body in any case; if it is found, move to
        # the ">" that closes the tag.  A tag whose ">" is on a later line
        # is treated as not found.
        bodystart = lowered.find("<body")
        if bodystart != -1:
            bodystart = html.find(">", bodystart)

        # If we've found both a body tag and a script tag, find which one
        # comes first; if script is first, this isn't the "real" body tag.
        if not inbody and bodystart != -1 and scriptstart != -1:
            if bodystart > scriptstart:
                inscript = True

        # Nothing to capture while inside a script or before the body.
        if inscript or (bodystart == -1 and not inbody):
            continue

        bodyend = lowered.find("</body>")
        if bodystart != -1 and bodyend != -1:
            # <BODY> and </BODY> are both on this line: grab what is between
            pieces.append(html[bodystart + 1:bodyend])
            inbody = False
        elif bodyend != -1:
            # last line of the body: keep everything up to the closing tag
            pieces.append(html[:bodyend])
            inbody = False
        elif bodystart != -1:
            # first line of the body: keep what follows the tag, minus the
            # final character of the line (normally the newline)
            pieces.append(html[bodystart + 1:-1])
            inbody = True
        else:
            # a line in the middle of the body
            pieces.append(html)
            inbody = True

    text = "".join(pieces)

    if not encoding:
        encoding = utils.guessEncodingForText(text)
    if encoding and encoding.lower() in ["windows-1252", "iso-8859-1", "iso-8859-2"]:
        text = convNotSoSmartQuotesToHtmlEntity(text)
    text = utils.makeUnicode(text, encoding, 'xmlcharrefreplace')

    # The lines from readlines() still carry their line endings, so they are
    # joined without a separator; joining on "\n" doubled every line break.
    soup = BeautifulSoup.BeautifulSoup("".join(htmltext))
    # A page without an <html> element has no head to inspect.
    if soup.html is not None and soup.html.head:
        scripts = soup.html.head.findAll('script')
        scripts.reverse() # since we're prepending, we need to do it in reverse order
        for script in scripts:
            # NOTE(review): `script` is a parser tag object, not a string;
            # confirm that "+" with a unicode string is supported by the
            # BeautifulSoup version in use.
            text = script + text
    return text