Beispiel #1
0
 def convert2(self, doc, encoding, mimetype):
     """Run ``self.convert`` on ``doc`` and return ``(utf8_text, 'utf-8')``.

     A non-unicode ``doc`` is first decoded with ``encoding`` (bytes that
     cannot be decoded are replaced); unicode input is taken as-is.  The
     text is passed through ``convert_entities`` before conversion and
     the converted result is encoded as UTF-8.  ``mimetype`` is not used
     by this method.
     """
     if isinstance(doc, unicode):
         text = doc
     else:
         text = unicode(doc, encoding, 'replace')

     converted = self.convert(convert_entities(text))

     # callers always receive UTF-8 bytes plus the matching encoding name
     return converted.encode('utf-8'), 'utf-8'
Beispiel #2
0
    def convert2(self, doc, encoding, mimetype):
        """Normalise ``doc`` to UTF-8 and pass it to ``self.convert``.

        An encoding matched by ``encoding_reg`` inside the document
        (presumably the XML preamble) takes precedence over the
        ``encoding`` argument.  If that name is not a codec Python knows,
        the caller-supplied ``encoding`` is used instead of failing.
        ``mimetype`` is not used by this method.

        Returns a ``(self.convert(utf8_bytes), 'utf-8')`` tuple.
        Raises ``LookupError`` only when no usable alternative to an
        unknown codec name exists.
        """
        declared = encoding

        # Use encoding from XML preamble if present
        mo = encoding_reg.search(doc)
        if mo:
            declared = mo.group(1)

        if not isinstance(doc, unicode):
            try:
                doc = unicode(doc, declared, 'replace')
            except LookupError:
                # The document named an unknown codec.  Retry with what the
                # caller asked for; re-raise if there is nothing else to try.
                if not encoding or declared == encoding:
                    raise
                doc = unicode(doc, encoding, 'replace')
        doc = convert_entities(doc)
        doc = doc.encode('utf-8')
        return self.convert(doc), 'utf-8'
Beispiel #3
0
    def convert(self, doc, encoding=None, mimetype=None,
                logError=False, raiseException=False):
        """Convert ``doc`` with ``html2text`` and return ``(utf8_text, 'utf-8')``.

        Unicode input is used as-is.  A byte string is decoded with
        ``encoding`` when given; otherwise the charset matched by
        ``charset_reg`` in the document is used, with ``'ascii'`` as the
        guess when nothing matches.  Undecodable bytes are replaced.

        A charset taken from the document is untrusted: if it is not a
        codec Python knows, the ``'ascii'`` guess is used instead of
        raising ``LookupError``.  An unknown ``encoding`` passed explicitly
        by the caller still raises, as before.

        ``mimetype``, ``logError`` and ``raiseException`` are not used by
        this method.
        """

        # convert to unicode
        if not isinstance(doc, unicode):
            if encoding:
                doc = unicode(doc, encoding, 'replace')
            else:
                mo = charset_reg.search(doc)
                if mo is not None:
                    declared = mo.group(1)
                else:
                    declared = 'ascii' # guess
                try:
                    doc = unicode(doc, declared, 'replace')
                except LookupError:
                    # bogus charset declaration in the document: use the
                    # same guess as when no declaration is found at all
                    doc = unicode(doc, 'ascii', 'replace')
        doc = convert_entities(doc)
        result = html2text(doc)

        # convert back to utf-8
        return result.encode('utf-8'), 'utf-8'
Beispiel #4
0
    def convert(self, doc, encoding, mimetype,
                logError=False, raiseException=False):
        """Feed ``doc`` to a ``StripTagParser`` and return ``(str(parser), 'utf-8')``.

        The encoding used to decode a byte-string ``doc`` is, in order of
        preference: the one matched by ``encoding_reg`` in the document
        (presumably the XML preamble), the ``encoding`` argument, then
        ``default_encoding``.  If the preferred name is not a codec Python
        knows, the next usable choice is tried instead of failing.
        Undecodable bytes are replaced.  The parser is fed UTF-8 bytes.

        ``mimetype``, ``logError`` and ``raiseException`` are not used by
        this method.
        """

        # What to decode with if the document's own declaration is unusable
        fallback = encoding or default_encoding

        # Use encoding from XML preamble if present
        mo = encoding_reg.search(doc)
        if mo:
            encoding = mo.group(1)

        if not encoding:
            encoding = default_encoding

        if not isinstance(doc, unicode):
            try:
                doc = unicode(doc, encoding, 'replace')
            except LookupError:
                # Unknown codec name.  Retrying only helps when the fallback
                # is a different name; otherwise let the error propagate.
                if encoding == fallback:
                    raise
                doc = unicode(doc, fallback, 'replace')
        doc = convert_entities(doc)
        doc = doc.encode('utf-8')
        p = StripTagParser()
        p.feed(doc)
        p.close()
        return str(p), 'utf-8'
Beispiel #5
0
    def convert(self,
                doc,
                encoding=None,
                mimetype=None,
                logError=False,
                raiseException=False):
        """Convert ``doc`` with ``html2text`` and return ``(utf8_text, 'utf-8')``.

        Unicode input is used as-is.  A byte string is decoded with
        ``encoding`` when given; otherwise the charset matched by
        ``charset_reg`` in the document is used, with ``'ascii'`` as the
        guess when nothing matches.  Undecodable bytes are replaced.

        A charset taken from the document is untrusted: if it is not a
        codec Python knows, the ``'ascii'`` guess is used instead of
        raising ``LookupError``.  An unknown ``encoding`` passed explicitly
        by the caller still raises, as before.

        ``mimetype``, ``logError`` and ``raiseException`` are not used by
        this method.
        """

        # convert to unicode
        if not isinstance(doc, unicode):
            if encoding:
                doc = unicode(doc, encoding, 'replace')
            else:
                mo = charset_reg.search(doc)
                if mo is not None:
                    declared = mo.group(1)
                else:
                    declared = 'ascii'  # guess
                try:
                    doc = unicode(doc, declared, 'replace')
                except LookupError:
                    # bogus charset declaration in the document: use the
                    # same guess as when no declaration is found at all
                    doc = unicode(doc, 'ascii', 'replace')
        doc = convert_entities(doc)
        result = html2text(doc)

        # convert back to utf-8
        return result.encode('utf-8'), 'utf-8'
Beispiel #6
0
    def convert(self,
                doc,
                encoding,
                mimetype,
                logError=False,
                raiseException=False):
        """Feed ``doc`` to a ``StripTagParser`` and return ``(str(parser), 'utf-8')``.

        The encoding used to decode a byte-string ``doc`` is, in order of
        preference: the one matched by ``encoding_reg`` in the document
        (presumably the XML preamble), the ``encoding`` argument, then
        ``default_encoding``.  If the preferred name is not a codec Python
        knows, the next usable choice is tried instead of failing.
        Undecodable bytes are replaced.  The parser is fed UTF-8 bytes.

        ``mimetype``, ``logError`` and ``raiseException`` are not used by
        this method.
        """

        # What to decode with if the document's own declaration is unusable
        fallback = encoding or default_encoding

        # Use encoding from XML preamble if present
        mo = encoding_reg.search(doc)
        if mo:
            encoding = mo.group(1)

        if not encoding:
            encoding = default_encoding

        if not isinstance(doc, unicode):
            try:
                doc = unicode(doc, encoding, 'replace')
            except LookupError:
                # Unknown codec name.  Retrying only helps when the fallback
                # is a different name; otherwise let the error propagate.
                if encoding == fallback:
                    raise
                doc = unicode(doc, fallback, 'replace')
        doc = convert_entities(doc)
        doc = doc.encode('utf-8')
        p = StripTagParser()
        p.feed(doc)
        p.close()
        return str(p), 'utf-8'