Beispiel #1
0
 def go(self):
     """
     Parse ``self.source`` and return the pyth document built from it.

     The parsed tree is first passed through ``self.format`` and then
     walked by ``self.process_into``.  ``self.css`` is (re)assigned on
     every call: built from ``self.css_source`` when that is truthy,
     otherwise an empty ``CSS()``.
     """
     parser = BeautifulSoup.BeautifulSoup
     tree = parser(
         self.source,
         convertEntities=parser.HTML_ENTITIES,
         fromEncoding=self.encoding,
         smartQuotesTo=None)
     # Collapse multi-line block content before walking the tree
     tree = self.format(tree)
     result = document.Document()
     # No stylesheet supplied -> fall back to an empty CSS object
     self.css = CSS(self.css_source) if self.css_source else CSS()
     self.process_into(tree, result)
     return result
Beispiel #2
0
 def go(self):
     """
     Build and return a ``document.Document`` from ``self.source``.

     Steps: parse the source with BeautifulSoup, run the tree through
     ``self.format``, set ``self.css`` (from ``self.css_source`` if it
     is truthy, else an empty ``CSS()``), then let
     ``self.process_into`` populate the document.
     """
     soup_class = BeautifulSoup.BeautifulSoup
     parse_options = dict(
         convertEntities=soup_class.HTML_ENTITIES,
         fromEncoding=self.encoding,
         smartQuotesTo=None)
     parsed = soup_class(self.source, **parse_options)
     # Flatten multi-line block content so later steps see single lines
     single_line = self.format(parsed)
     output = document.Document()
     if not self.css_source:
         self.css = CSS()  # nothing supplied: empty css
     else:
         self.css = CSS(self.css_source)
     self.process_into(single_line, output)
     return output
Beispiel #3
0
class XHTMLReader(PythReader):
    """
    Reader that turns an (X)HTML source into a pyth ``document.Document``.

    The markup is parsed with BeautifulSoup (bs4-style keyword
    arguments) and the resulting tree is walked recursively.  Inline
    formatting is derived from the enclosing tags and from the
    optional CSS source.
    """

    @classmethod
    def read(cls, source, css_source=None, encoding="utf-8"):
        """
        Create a reader for ``source`` and return the document it builds.

        ``css_source`` is handed to ``CSS`` when truthy; ``encoding``
        is forwarded to BeautifulSoup as ``from_encoding``.
        """
        # Instantiate through ``cls`` so that a subclass calling
        # ``read`` gets a reader of its own type.
        reader = cls(source, css_source, encoding)
        return reader.go()

    def __init__(self, source, css_source=None, encoding="utf-8"):
        """Store the markup, the optional CSS source and the encoding."""
        self.source = source
        self.css_source = css_source
        self.encoding = encoding

    def go(self):
        """
        Parse ``self.source`` and return a populated ``document.Document``.

        ``self.css`` is set as a side effect: a ``CSS`` built from
        ``self.css_source`` when it is truthy, otherwise an empty
        ``CSS()``.
        """
        # bs4 keyword spelling (``from_encoding``).
        soup = BeautifulSoup.BeautifulSoup(self.source, from_encoding=self.encoding)
        # NOTE: the ``self.format`` pass is not applied in this variant.
        doc = document.Document()
        if self.css_source:
            self.css = CSS(self.css_source)
        else:
            self.css = CSS()    # empty css
        self.process_into(soup, doc)
        return doc

    def format(self, soup):
        """format a BeautifulSoup document

        This will transform the block elements content from
        multi-lines text into single line.

        This allows us to avoid having to deal with further text
        rendering once this step has been done.

        Returns a new soup re-parsed from the modified markup.
        """
        # Remove all the newline characters before a closing tag.
        for node in soup.findAll(text=True):
            trimmed = node.rstrip(" ")
            if trimmed.endswith("\n"):
                node.replaceWith(trimmed.rstrip("\n"))
        # Join the block elements lines into a single long line
        for tag in ['p', 'li']:
            for node in soup.findAll(tag):
                lines = [x.strip() for x in unicode(node).splitlines()]
                node.replaceWith(BeautifulSoup.BeautifulSoup(' '.join(lines)))

        # Re-parse so the replacements above become regular tree nodes
        soup = BeautifulSoup.BeautifulSoup(unicode(soup))
        # replace all <br/> tag by newline character
        for node in soup.findAll('br'):
            node.replaceWith("\n")
        soup = BeautifulSoup.BeautifulSoup(unicode(soup))
        return soup

    def is_bold(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        bold (inside ``<b>``/``<strong>`` or flagged by the CSS).
        """
        return (node.findParent(['b', 'strong']) is not None or
                self.css.is_bold(node))

    def is_italic(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        italic (inside ``<em>``/``<i>`` or flagged by the CSS).
        """
        return (node.findParent(['em', 'i']) is not None
                or self.css.is_italic(node))

    def is_sub(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        sub (inside ``<sub>`` or flagged by the CSS).
        """
        return (node.findParent(['sub']) is not None
                or self.css.is_sub(node))

    def is_super(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        super (inside ``<sup>`` or flagged by the CSS).
        """
        return (node.findParent(['sup']) is not None
                or self.css.is_super(node))

    def url(self, node):
        """
        Return the ``href`` of the ``<a>`` ancestor of a BeautifulSoup
        node, or None if there is no such ancestor.
        """
        a_node = node.findParent('a')
        if not a_node:
            return None
        return a_node.get('href')

    def process_text(self, node):
        """
        Return a pyth Text object from a BeautifulSoup node or None if
        the text is empty.

        Emptiness is judged on the stripped text, but the Text content
        keeps the original, unstripped string.
        """
        if not node.string.strip():
            return

        # Set all the properties
        properties = {}
        if self.is_bold(node):
            properties['bold'] = True
        if self.is_italic(node):
            properties['italic'] = True
        # Look the link up once instead of once for the test and once
        # for the value.
        href = self.url(node)
        if href:
            properties['url'] = href
        if self.is_sub(node):
            properties['sub'] = True
        if self.is_super(node):
            properties['super'] = True

        return document.Text(properties, [node.string])

    def process_into(self, node, obj):
        """
        Process a BeautifulSoup node and fill its elements into a pyth
        base object.

        Text nodes are converted with ``process_text`` and appended to
        ``obj``.  ``<p>``, ``<ul>``/``<ol>`` and ``<li>`` tags each
        append a new Paragraph, List or ListEntry to ``obj``; all
        children are then processed recursively.
        """
        if isinstance(node, BeautifulSoup.NavigableString):
            text = self.process_text(node)
            if text:
                obj.append(text)
            return
        parent_obj = obj
        if node.name == 'p':
            # add a new paragraph into the pyth object
            obj = document.Paragraph()
            parent_obj.append(obj)
        elif node.name in ('ul', 'ol'):
            # add a new list
            obj = document.List()
            parent_obj.append(obj)
        elif node.name == 'li':
            # add a new list entry
            obj = document.ListEntry()
            parent_obj.append(obj)
        # NOTE(review): children of <p>/<li> are appended to the parent
        # container, not to the Paragraph/ListEntry created above --
        # behaviour kept as-is; confirm this is intended.
        target = parent_obj if node.name in ('p', 'li') else obj
        for child in node:
            self.process_into(child, target)
Beispiel #4
0
class XHTMLReader(PythReader):
    """
    Reader that turns an (X)HTML source into a pyth ``document.Document``.

    The markup is parsed with BeautifulSoup, flattened by ``format``
    and then walked recursively by ``process_into``.  Inline
    formatting is derived from the enclosing tags and from the
    optional CSS source.
    """

    @classmethod
    def read(cls, source, css_source=None, encoding="utf-8"):
        """
        Create a reader for ``source`` and return the document it builds.

        ``css_source`` is handed to ``CSS`` when truthy; ``encoding``
        is forwarded to BeautifulSoup as ``fromEncoding``.
        """
        # Instantiate through ``cls`` so that a subclass calling
        # ``read`` gets a reader of its own type.
        reader = cls(source, css_source, encoding)
        return reader.go()

    def __init__(self, source, css_source=None, encoding="utf-8"):
        """Store the markup, the optional CSS source and the encoding."""
        self.source = source
        self.css_source = css_source
        self.encoding = encoding

    def go(self):
        """
        Parse ``self.source`` and return a populated ``document.Document``.

        ``self.css`` is set as a side effect: a ``CSS`` built from
        ``self.css_source`` when it is truthy, otherwise an empty
        ``CSS()``.
        """
        soup = BeautifulSoup.BeautifulSoup(
            self.source,
            convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES,
            fromEncoding=self.encoding,
            smartQuotesTo=None)
        # Make sure the document content doesn't use multi-lines
        soup = self.format(soup)
        doc = document.Document()
        if self.css_source:
            self.css = CSS(self.css_source)
        else:
            self.css = CSS()  # empty css
        self.process_into(soup, doc)
        return doc

    def format(self, soup):
        """format a BeautifulSoup document

        This will transform the block elements content from
        multi-lines text into single line.

        This allows us to avoid having to deal with further text
        rendering once this step has been done.

        Returns a new soup re-parsed from the modified markup.
        """
        # Remove all the newline characters before a closing tag.
        for node in soup.findAll(text=True):
            trimmed = node.rstrip(" ")
            if trimmed.endswith("\n"):
                node.replaceWith(trimmed.rstrip("\n"))
        # Join the block elements lines into a single long line
        for tag in ['p', 'li']:
            for node in soup.findAll(tag):
                lines = [x.strip() for x in unicode(node).splitlines()]
                node.replaceWith(' '.join(lines))
        # Re-parse so the replacement strings become tree nodes again
        soup = BeautifulSoup.BeautifulSoup(unicode(soup))
        # replace all <br/> tag by newline character
        for node in soup.findAll('br'):
            node.replaceWith("\n")
        soup = BeautifulSoup.BeautifulSoup(unicode(soup))
        return soup

    def is_bold(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        bold (inside ``<b>``/``<strong>`` or flagged by the CSS).
        """
        return (node.findParent(['b', 'strong']) is not None
                or self.css.is_bold(node))

    def is_italic(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        italic (inside ``<em>``/``<i>`` or flagged by the CSS).
        """
        return (node.findParent(['em', 'i']) is not None
                or self.css.is_italic(node))

    def is_sub(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        sub (inside ``<sub>`` or flagged by the CSS).
        """
        return (node.findParent(['sub']) is not None or self.css.is_sub(node))

    def is_super(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        super (inside ``<sup>`` or flagged by the CSS).
        """
        return (node.findParent(['sup']) is not None
                or self.css.is_super(node))

    def url(self, node):
        """
        Return the ``href`` of the ``<a>`` ancestor of a BeautifulSoup
        node, or None if there is no such ancestor.
        """
        a_node = node.findParent('a')
        if not a_node:
            return None
        return a_node.get('href')

    def process_text(self, node):
        """
        Return a pyth Text object from a BeautifulSoup node or None if
        the text is empty.

        Emptiness is judged on the stripped text, but the Text content
        keeps the original, unstripped string.
        """
        if not node.string.strip():
            return

        # Set all the properties
        properties = {}
        if self.is_bold(node):
            properties['bold'] = True
        if self.is_italic(node):
            properties['italic'] = True
        # Look the link up once instead of once for the test and once
        # for the value.
        href = self.url(node)
        if href:
            properties['url'] = href
        if self.is_sub(node):
            properties['sub'] = True
        if self.is_super(node):
            properties['super'] = True

        return document.Text(properties, [node.string])

    def process_into(self, node, obj):
        """
        Process a BeautifulSoup node and fill its elements into a pyth
        base object.

        Text nodes are converted with ``process_text`` and appended to
        ``obj``.  ``<p>``, ``<ul>``/``<ol>`` and ``<li>`` tags each
        append a new Paragraph, List or ListEntry to ``obj``, and their
        children are processed into that new object; children of any
        other tag are processed into ``obj`` itself.
        """
        if isinstance(node, BeautifulSoup.NavigableString):
            text = self.process_text(node)
            if text:
                obj.append(text)
            return
        if node.name == 'p':
            # add a new paragraph into the pyth object
            new_obj = document.Paragraph()
            obj.append(new_obj)
            obj = new_obj
        elif node.name in ('ul', 'ol'):
            # add a new list; ordered lists get the same List container
            # as unordered ones
            new_obj = document.List()
            obj.append(new_obj)
            obj = new_obj
        elif node.name == 'li':
            # add a new list entry
            new_obj = document.ListEntry()
            obj.append(new_obj)
            obj = new_obj
        for child in node:
            self.process_into(child, obj)
Beispiel #5
0
class XHTMLReader(PythReader):
    """
    Reader that turns an (X)HTML source into a pyth ``document.Document``.

    The markup is parsed with BeautifulSoup, flattened by ``format``
    and then walked recursively by ``process_into``.  Inline
    formatting is derived from the enclosing tags and from the
    optional CSS source.
    """

    @classmethod
    def read(cls, source, css_source=None, encoding="utf-8"):
        """
        Create a reader for ``source`` and return the document it builds.

        ``css_source`` is handed to ``CSS`` when truthy; ``encoding``
        is forwarded to BeautifulSoup as ``fromEncoding``.
        """
        # Instantiate through ``cls`` so that a subclass calling
        # ``read`` gets a reader of its own type.
        reader = cls(source, css_source, encoding)
        return reader.go()

    def __init__(self, source, css_source=None, encoding="utf-8"):
        """Store the markup, the optional CSS source and the encoding."""
        self.source = source
        self.css_source = css_source
        self.encoding = encoding

    def go(self):
        """
        Parse ``self.source`` and return a populated ``document.Document``.

        ``self.css`` is set as a side effect: a ``CSS`` built from
        ``self.css_source`` when it is truthy, otherwise an empty
        ``CSS()``.
        """
        soup = BeautifulSoup.BeautifulSoup(
            self.source,
            convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES,
            fromEncoding=self.encoding,
            smartQuotesTo=None,
        )
        # Make sure the document content doesn't use multi-lines
        soup = self.format(soup)
        doc = document.Document()
        if self.css_source:
            self.css = CSS(self.css_source)
        else:
            self.css = CSS()  # empty css
        self.process_into(soup, doc)
        return doc

    def format(self, soup):
        """format a BeautifulSoup document

        This will transform the block elements content from
        multi-lines text into single line.

        This allows us to avoid having to deal with further text
        rendering once this step has been done.

        Returns a new soup re-parsed from the modified markup.
        """
        # Remove all the newline characters before a closing tag.
        for node in soup.findAll(text=True):
            trimmed = node.rstrip(" ")
            if trimmed.endswith("\n"):
                node.replaceWith(trimmed.rstrip("\n"))
        # Join the block elements lines into a single long line
        for tag in ["p", "li"]:
            for node in soup.findAll(tag):
                lines = [x.strip() for x in unicode(node).splitlines()]
                node.replaceWith(" ".join(lines))
        # Re-parse so the replacement strings become tree nodes again
        soup = BeautifulSoup.BeautifulSoup(unicode(soup))
        # replace all <br/> tag by newline character
        for node in soup.findAll("br"):
            node.replaceWith("\n")
        soup = BeautifulSoup.BeautifulSoup(unicode(soup))
        return soup

    def is_bold(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        bold (inside ``<b>``/``<strong>`` or flagged by the CSS).
        """
        return node.findParent(["b", "strong"]) is not None or self.css.is_bold(node)

    def is_italic(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        italic (inside ``<em>``/``<i>`` or flagged by the CSS).
        """
        return node.findParent(["em", "i"]) is not None or self.css.is_italic(node)

    def is_sub(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        sub (inside ``<sub>`` or flagged by the CSS).
        """
        return node.findParent(["sub"]) is not None or self.css.is_sub(node)

    def is_super(self, node):
        """
        Return true if the BeautifulSoup node needs to be rendered as
        super (inside ``<sup>`` or flagged by the CSS).
        """
        return node.findParent(["sup"]) is not None or self.css.is_super(node)

    def url(self, node):
        """
        Return the ``href`` of the ``<a>`` ancestor of a BeautifulSoup
        node, or None if there is no such ancestor.
        """
        a_node = node.findParent("a")
        if not a_node:
            return None
        return a_node.get("href")

    def process_text(self, node):
        """
        Return a pyth Text object from a BeautifulSoup node or None if
        the text is empty.

        Emptiness is judged on the stripped text, but the Text content
        keeps the original, unstripped string.
        """
        if not node.string.strip():
            return

        # Set all the properties
        properties = {}
        if self.is_bold(node):
            properties["bold"] = True
        if self.is_italic(node):
            properties["italic"] = True
        # Look the link up once instead of once for the test and once
        # for the value.
        href = self.url(node)
        if href:
            properties["url"] = href
        if self.is_sub(node):
            properties["sub"] = True
        if self.is_super(node):
            properties["super"] = True

        return document.Text(properties, [node.string])

    def process_into(self, node, obj):
        """
        Process a BeautifulSoup node and fill its elements into a pyth
        base object.

        Text nodes are converted with ``process_text`` and appended to
        ``obj``.  ``<p>``, ``<ul>``/``<ol>`` and ``<li>`` tags each
        append a new Paragraph, List or ListEntry to ``obj``, and their
        children are processed into that new object; children of any
        other tag are processed into ``obj`` itself.
        """
        if isinstance(node, BeautifulSoup.NavigableString):
            text = self.process_text(node)
            if text:
                obj.append(text)
            return
        if node.name == "p":
            # add a new paragraph into the pyth object
            new_obj = document.Paragraph()
            obj.append(new_obj)
            obj = new_obj
        elif node.name in ("ul", "ol"):
            # add a new list; ordered lists get the same List container
            # as unordered ones
            new_obj = document.List()
            obj.append(new_obj)
            obj = new_obj
        elif node.name == "li":
            # add a new list entry
            new_obj = document.ListEntry()
            obj.append(new_obj)
            obj = new_obj
        for child in node:
            self.process_into(child, obj)