コード例 #1
0
ファイル: htmlutils.py プロジェクト: kollivier/brightwriter
def getUnicodeHTMLForFile(filename):
    """
    Read the file at `filename` and return its contents as Unicode.

    The raw bytes are handed to GetEncoding() to find the encoding to use.
    When GetEncoding() reports nothing, an empty encoding string is passed
    to utils.makeUnicode() instead.

    Arguments:
    - filename: path of the HTML file to read (opened in binary mode)

    Return values:
    Whatever utils.makeUnicode() returns for the file's contents.
    """
    # Close the handle explicitly rather than leaving it to the garbage
    # collector; try/finally keeps this valid on old Python 2 interpreters.
    htmlfile = open(filename, "rb")
    try:
        html = htmlfile.read()
    finally:
        htmlfile.close()

    encoding = GetEncoding(html) or ""
    return utils.makeUnicode(html, encoding)
コード例 #2
0
 def normalize(name):
     """
     Bring an already URL-mangled page name into canonical form.

     Surrounding whitespace is trimmed, the name is lower-cased and inner
     spaces become underscores; special characters are NOT encoded.  The
     transform is meant to be repeatable: applying it to a name that is
     already normalized leaves it unchanged, so equivalent names map to the
     same database key.  Human-readable names should go through the mangle
     function instead.
     """
     trimmed = makeUnicode(name).strip()
     folded = trimmed.lower().replace(u" ", u"_")
     return removeEscapes(folded)
コード例 #3
0
ファイル: dom.py プロジェクト: lcrocker/ewc
 def __setitem__(self, key, val):
     """
     Store attribute `key`, with special handling for "class" and "style".

     - "class": `val` is split on single spaces and kept in self.classes.
     - "style": `val` is read as ";"-separated "name:value" declarations and
       each one is stored in self.styles; empty declarations are skipped and
       a declaration without ":" gets an empty value.
     - any other key: makeUnicode(val) is stored in self.map.
     """
     if key == "class":
         self.classes = val.split(u" ")
         return

     if key != "style":
         self.map[key] = makeUnicode(val)
         return

     for declaration in val.split(u";"):
         if not declaration:
             continue
         # Only the first ":" separates name from value.
         prop, _sep, setting = declaration.partition(":")
         # NOTE(review): the value goes through .decode() here while other
         # attributes use makeUnicode() -- confirm this is intentional.
         self.styles[prop.strip()] = setting.strip().decode()
コード例 #4
0
def getPrefix(line):
    """
    Split a leading colon-terminated namespace prefix off `line`.

    Returns a (prefix, remainder) pair.  A prefix must start with a letter
    and may continue with letters, digits, "-" or "_"; if no valid prefix is
    found the prefix is empty and the remainder is the left-stripped line.
    A single leading colon is dropped and also yields an empty prefix.

    >>> from namespaces import getPrefix
    >>> getPrefix("")
    (u'', u'')
    >>> getPrefix(":")
    (u'', u'')
    >>> getPrefix(":abc")
    (u'', u'abc')
    >>> getPrefix("::abc")
    (u'', u':abc')
    >>> getPrefix("abc")
    (u'', u'abc')
    >>> getPrefix("abc:")
    (u'abc', u'')
    >>> getPrefix("abc:def")
    (u'abc', u'def')
    >>> getPrefix("abc:def:ghi")
    (u'abc', u'def:ghi')
    >>> getPrefix("abc::")
    (u'abc', u':')
    """
    text = makeUnicode(line.lstrip())
    no_prefix = u""

    if not text:
        return no_prefix, text

    first = text[0]
    if first == u":":
        return no_prefix, text[1:]
    if not first.isalpha():
        return no_prefix, text

    # Scan forward from the second character: the first colon ends the
    # prefix, any character not allowed in a prefix means there is none.
    for pos in range(1, len(text)):
        ch = text[pos]
        if ch == u":":
            return text[:pos], text[pos + 1:]
        if not (ch.isalnum() or ch in u"-_"):
            return no_prefix, text

    # Ran off the end without meeting a colon.
    return no_prefix, text
コード例 #5
0
    def demangle(name):
        """
        Given a name mangled as above, render it in a more readable form.

        Underscores become spaces and each "$XX" sequence (two hexadecimal
        digits) is replaced by the character with that code.  The result is
        passed through removeEscapes() and then capitalized.

        May raise ValueError when a "$" with at least two characters after
        it is not followed by two hexadecimal digits.

        >>> from utils import makeUnicode, removeEscapes
        >>> import namespaces
        >>> namespaces.Local.demangle("a_page_title")
        u'A page title'
        >>> namespaces.Local.demangle("2$3a_a_10$25_$245_b_c")
        u'2: a 10% $5 b c'
        """
        chars = list(makeUnicode(name))
        # Bound the scan by the converted character list rather than by the
        # raw argument, so the indices stay valid even if makeUnicode()
        # returns a string whose length differs from len(name).
        count = len(chars)
        for i, ch in enumerate(chars):
            if ch == u"_":
                chars[i] = u" "
            elif ch == u"$" and i < count - 2:
                code = 16 * int(chars[i+1], 16) + int(chars[i+2], 16)
                chars[i] = unichr(code)
                # Overwrite the two consumed hex digits with NUL placeholders
                # (presumably stripped by removeEscapes() below -- the
                # doctest output contains none).
                chars[i+1] = u"\u0000"
                chars[i+2] = u"\u0000"
        return removeEscapes(u"".join(chars)).capitalize()
コード例 #6
0
ファイル: htmlutils.py プロジェクト: kollivier/brightwriter
def copyDependentFilesAndUpdateLinks(oldfile, filename):
    """
    Copy the files an HTML document links to into the project and rewrite
    the document's links to point at the copies.

    Arguments:
    - oldfile: path of the original HTML file; links are resolved relative
      to its directory
    - filename: path of the HTML file that is analyzed and rewritten in place

    Every link reported by the content analyzer whose source file exists is
    copied to "File/<html file name>_files/" under settings.ProjectDir, and
    occurrences of the link in the document are replaced by the URL-quoted
    relative path "../File/<html file name>_files/<dependency name>".
    Missing sources and failed copies are reported with print and skipped.
    """
    myanalyzer = analyzer.ContentAnalyzer()
    myanalyzer.analyzeFile(filename)
    htmldir = os.path.dirname(oldfile)

    infile = utils.openFile(filename, "r")
    try:
        html = infile.read()
    finally:
        infile.close()

    encoding = GetEncoding(html)
    if encoding is None:
        encoding = utils.getCurrentEncoding()
    # Settle the encoding before converting.  The original guessed only
    # after the conversion and from an undefined name (`text`), so this
    # path raised NameError whenever the encoding was empty.
    if not encoding:
        encoding = utils.guessEncodingForText(html)

    html = utils.makeUnicode(html, encoding)

    if encoding and encoding.lower() in ["windows-1252", "iso-8859-1", "iso-8859-2"]:
        html = convNotSoSmartQuotesToHtmlEntity(html)

    htmlname = os.path.basename(filename)
    for link in myanalyzer.fileLinks:
        sourcefile = GetFullPathForURL(link, htmldir)

        if not os.path.exists(sourcefile):
            print("cannot find source file: " + sourcefile)
            continue

        sourcedir = os.path.dirname(sourcefile)
        depName = os.path.basename(link)
        destLink = u"../File/" + htmlname + "_files/" + depName
        # destLink[3:] drops the leading "../" to get a project-relative path.
        destdir = os.path.join(settings.ProjectDir, os.path.dirname(destLink[3:].replace("/", os.sep)))
        if not os.path.exists(destdir):
            os.makedirs(destdir)

        if fileutils.CopyFile(depName, sourcedir, destdir):
            html = html.replace(link, urllib.quote(destLink))
        else:
            print("unable to copy file: " + sourcefile)

    # Encode before opening for writing, so an encoding failure cannot
    # leave the file truncated.
    encoded = html.encode(encoding)
    output = utils.openFile(filename, "w")
    try:
        output.write(encoded)
    finally:
        output.close()
コード例 #7
0
ファイル: conman.py プロジェクト: kollivier/brightwriter
 def __setattr__(self, name, value):
     """
     Store attribute `name` directly in the instance dictionary.

     The "encoding" attribute is stored as given; every other value is first
     passed through utils.makeUnicode() with self.encoding, so that values
     are kept as Unicode internally.
     """
     if name == "encoding":
         stored = value
     else:
         stored = utils.makeUnicode(value, self.encoding)
     self.__dict__[name] = stored
コード例 #8
0
ファイル: dom.py プロジェクト: lcrocker/ewc
 def _setvalue(self, text):
     """Store `text`, passed through makeUnicode(), in self._value."""
     converted = makeUnicode(text)
     self._value = converted
コード例 #9
0
ファイル: dom.py プロジェクト: lcrocker/ewc
 def __init__(self, parent=None, val=u""):
     """
     Initialize the node with `parent` and store `val` as its value.

     CharacterData itself cannot be instantiated: constructing it directly
     (rather than through a subclass) raises NotImplementedError.
     """
     created_directly = self.__class__ is CharacterData
     if created_directly:
         raise NotImplementedError

     Node.__init__(self, parent)
     initial = makeUnicode(val)
     self._value = initial
コード例 #10
0
 def imageURL(self, title):
     """Return config.localImagePattern filled in with the mangled `title`."""
     pattern = makeUnicode(config.localImagePattern)
     mangled = Local.mangle(title)
     return pattern % mangled
コード例 #11
0
 def linkURL(self, title):
     """Return config.localLinkPattern filled in with the mangled `title`."""
     pattern = makeUnicode(config.localLinkPattern)
     mangled = Local.mangle(title)
     return pattern % mangled
コード例 #12
0
ファイル: htmlutils.py プロジェクト: kollivier/brightwriter
def GetBody(myhtml):
    """
    Return the markup found between the <body> and </body> tags of a page.

    Arguments:
    - myhtml: a file-like object holding the HTML page; it is consumed with
      readlines()

    Return values:
    The body content after utils.makeUnicode(), with any <script> elements
    found in the document's <head> prepended to it.

    The page is scanned line by line.  The encoding is taken from the first
    line containing "<meta" for which GetEncoding() reports one, and guessed
    from the collected body text otherwise.  A "<body" tag that follows a
    "<script" tag on the same line is treated as part of the script rather
    than as the real body tag.
    """
    inbody = 0
    inscript = 0
    encoding = None
    parts = []
    htmltext = myhtml.readlines()
    for html in htmltext:
        lowered = html.lower()

        if not encoding and lowered.find("<meta") != -1:
            encoding = GetEncoding(html)

        # Note where a script starts so we can tell whether a body tag on
        # this line sits inside it.
        scriptstart = html.find("<SCRIPT")
        if scriptstart == -1:
            scriptstart = html.find("<script")

        if lowered.find("</script>") != -1:
            inscript = 0

        # Find the body tag case-insensitively, then move to the ">" that
        # closes it (-1 again if the tag is not closed on this line).
        bodystart = lowered.find("<body")
        if bodystart != -1:
            bodystart = html.find(">", bodystart)

        # If both a body tag and a script tag are on this line and the
        # script comes first, this is not the "real" body tag.
        if not inbody and bodystart != -1 and scriptstart != -1:
            if bodystart > scriptstart:
                inscript = 1

        # Outside a script, capture text once the body tag has been seen.
        if inscript == 0 and (bodystart != -1 or inbody):
            inbody = 1
            bodyend = lowered.find("</body>")

            if bodystart != -1 and bodyend != -1:
                # <body> and </body> on the same line: take what is between
                parts.append(html[bodystart+1:bodyend])
                inbody = 0
            elif bodyend != -1:
                inbody = 0
                parts.append(html[0:bodyend])
            elif bodystart != -1:
                # The -1 drops the line's final character (presumably the
                # line terminator kept by readlines()).
                parts.append(html[bodystart+1:-1])
            else:
                parts.append(html)

    text = "".join(parts)

    if not encoding:
        encoding = utils.guessEncodingForText(text)

    if encoding and encoding.lower() in ["windows-1252", "iso-8859-1", "iso-8859-2"]:
        text = convNotSoSmartQuotesToHtmlEntity(text)

    text = utils.makeUnicode(text, encoding, 'xmlcharrefreplace')

    # NOTE(review): readlines() keeps line terminators, so joining with
    # '\n' doubles them in the parser input -- confirm this is harmless.
    soup = BeautifulSoup.BeautifulSoup('\n'.join(htmltext))
    root = soup.html
    # Pages without an <html> element have no head to inspect; the original
    # raised AttributeError here in that case.
    if root is not None and root.head:
        scripts = root.head.findAll('script')
        scripts.reverse() # since we're prepending, we need to do it in reverse order
        for script in scripts:
            # NOTE(review): relies on the parser's tag objects supporting
            # "+" with the text -- verify, or convert the tag to a string.
            text = script + text

    return text