Exemples Python de convert_entities (karl.utilities.converters.entities.convert_entities)

Exemple #1

0

Afficher le fichier

Fichier : html.py Projet : reebalazs/karl

    def convert(self, filename, encoding=None, mimetype=None):
        """Convert the HTML file at ``filename`` to plain text.

        When ``encoding`` is falsy, the document is searched with
        ``charset_reg`` and the first captured group is used as the
        encoding; if nothing matches, UTF-8 is assumed.  ``mimetype`` is
        accepted but not used by this method.

        Returns a ``(stream, encoding)`` tuple: a ``StringIO`` holding the
        UTF-8 encoded text, and the string ``"utf-8"``.
        """
        # XXX: dont read entire file into memory
        # Use a context manager so the handle is closed deterministically.
        with open(filename, "r") as source:
            doc = source.read()

        # convert to unicode
        if not encoding:
            mo = charset_reg.search(doc)
            if mo:
                encoding = mo.group(1)
            else:
                # No match: fall back to UTF-8 instead of failing with
                # AttributeError on ``None.group``.
                encoding = "UTF-8"
        doc = unicode(doc, encoding, "replace")
        # Entities are converted both before and after html2text runs.
        doc = convert_entities(doc)
        result = convert_entities(html2text(doc))

        # convert back to utf-8
        return StringIO.StringIO(result.encode("utf-8")), "utf-8"

Exemple #2

0

Afficher le fichier

    def convert(self, filename, encoding=None, mimetype=None):
        """Convert the HTML file at ``filename`` to plain text.

        When ``encoding`` is falsy, the document is searched with
        ``charset_reg`` and the first captured group is used as the
        encoding; if nothing matches, UTF-8 is assumed.  ``mimetype`` is
        accepted but not used by this method.

        Returns a ``(stream, encoding)`` tuple: a ``StringIO`` holding the
        UTF-8 encoded text, and the string ``'utf-8'``.
        """
        # XXX: dont read entire file into memory
        # The ``with`` block closes the file even if ``read`` raises.
        with open(filename, 'r') as source:
            doc = source.read()

        # convert to unicode
        if not encoding:
            mo = charset_reg.search(doc)
            if mo:
                encoding = mo.group(1)
            else:
                encoding = 'UTF-8'  # UTF-8 is the new ASCII
        # Undecodable bytes are replaced rather than raising.
        doc = unicode(doc, encoding, 'replace')
        # Entities are converted both before and after html2text runs.
        doc = convert_entities(doc)
        result = convert_entities(html2text(doc))

        # convert back to utf-8
        return StringIO.StringIO(result.encode('utf-8')), 'utf-8'

Exemple #3

0

Afficher le fichier

Fichier : html.py Projet : claytron/karl

    def convert(self, filename, encoding=None, mimetype=None):
        """Read an HTML file and return its text content as UTF-8.

        If the caller gives no ``encoding``, the first group matched by
        ``charset_reg`` in the document is used, with UTF-8 as the default
        when there is no match.  ``mimetype`` is accepted but unused here.

        Returns a 2-tuple of a ``StringIO`` containing the UTF-8 encoded
        text and the literal encoding name ``'utf-8'``.
        """
        # XXX: dont read entire file into memory
        # Close the file explicitly rather than leaving it to the GC.
        with open(filename, 'r') as html_file:
            doc = html_file.read()

        # convert to unicode
        if not encoding:
            mo = charset_reg.search(doc)
            if mo:
                encoding = mo.group(1)
            else:
                encoding = 'UTF-8'  # UTF-8 is the new ASCII
        # 'replace' substitutes undecodable bytes instead of raising.
        doc = unicode(doc, encoding, 'replace')
        # convert_entities runs on the markup and again on the extracted text.
        doc = convert_entities(doc)
        result = convert_entities(html2text(doc))

        # convert back to utf-8
        return StringIO.StringIO(result.encode('utf-8')), 'utf-8'

Exemple #4

0

Afficher le fichier

Fichier : sgml.py Projet : iotest3/new

    def convert(self, filename, encoding, mimetype):
        """Strip markup from the file at ``filename`` and return UTF-8 text.

        The encoding is chosen in this order: the first group matched by
        ``encoding_reg`` in the document (this overrides the ``encoding``
        argument), then the ``encoding`` argument, then
        ``default_encoding``.  ``mimetype`` is accepted but not used here.

        Returns a ``(stream, encoding)`` tuple whose second item is always
        ``'utf-8'``.
        """

        # XXX: dont read entire file into memory
        # The ``with`` block closes the file even if ``read`` raises.
        with open(filename, 'r') as source:
            doc = source.read()

        # Use encoding from XML preamble if present
        mo = encoding_reg.search(doc)
        if mo:
            encoding = mo.group(1)

        if not encoding:
            encoding = default_encoding

        if not isinstance(doc, unicode):
            doc = unicode(doc, encoding, 'replace')
        doc = convert_entities(doc)
        # The parser is fed UTF-8 encoded bytes, not a unicode object.
        doc = doc.encode('utf-8')
        p = StripTagParser()
        p.feed(doc)
        p.close()
        # NOTE(review): the parser object itself is handed to StringIO;
        # presumably this relies on StringIO coercing it with str() --
        # confirm StripTagParser defines __str__.
        return StringIO.StringIO(p), 'utf-8'

Exemple #5

0

Afficher le fichier

Fichier : sgml.py Projet : Falmarri/karl

    def convert(self, filename, encoding, mimetype):
        """Read the file at ``filename``, strip its tags, return UTF-8 text.

        An encoding captured by ``encoding_reg`` in the document takes
        precedence over the ``encoding`` argument; when neither yields a
        value, ``default_encoding`` is used.  ``mimetype`` is accepted but
        unused by this method.

        Returns a 2-tuple of a ``StringIO`` stream and the literal
        encoding name ``'utf-8'``.
        """

        # XXX: dont read entire file into memory
        # Close the file explicitly rather than leaving it to the GC.
        with open(filename, 'r') as sgml_file:
            doc = sgml_file.read()

        # Use encoding from XML preamble if present
        mo = encoding_reg.search(doc)
        if mo:
            encoding = mo.group(1)

        if not encoding:
            encoding = default_encoding

        if not isinstance(doc, unicode):
            # 'replace' substitutes undecodable bytes instead of raising.
            doc = unicode(doc, encoding, 'replace')
        doc = convert_entities(doc)
        # Re-encode so the parser receives UTF-8 bytes.
        doc = doc.encode('utf-8')
        p = StripTagParser()
        p.feed(doc)
        p.close()
        # NOTE(review): StringIO receives the parser instance, not a string;
        # presumably it is stringified via str(p) -- verify against
        # StripTagParser.
        return StringIO.StringIO(p), 'utf-8'

Exemple #6

0

Afficher le fichier

def extract_text_from_html(text):
    """Return the stripped text extracted from an HTML string.

    Input that is not already ``unicode`` is decoded as UTF-8, with
    undecodable bytes replaced.  ``convert_entities`` is applied to the
    markup before ``html2text`` and again to its output.
    """
    if isinstance(text, unicode):
        markup = text
    else:
        markup = unicode(text, 'utf-8', 'replace')
    markup = convert_entities(markup)
    plain = convert_entities(html2text(markup))
    return plain.strip()

Exemple #7

0

Afficher le fichier

Fichier : adapters.py Projet : Falmarri/karl

def extract_text_from_html(text):
    """Extract text from HTML and strip surrounding whitespace.

    Non-``unicode`` input is first decoded as UTF-8 using the 'replace'
    error handler.  Entities are converted once on the raw markup and once
    more on the result of ``html2text``.
    """
    needs_decoding = not isinstance(text, unicode)
    if needs_decoding:
        text = unicode(text, 'utf-8', 'replace')

    without_entities = convert_entities(text)
    as_text = html2text(without_entities)
    cleaned = convert_entities(as_text)
    return cleaned.strip()