Exemple #1
0
    def test_one_node(self):
        nodes = [
            NodeSequence([
                Node("This is beginning ", "and end of sentence.", 1, "p"),
            ])
        ]

        expected = "\nThis is beginning and end of sentence."

        content = to_text(nodes)

        self.assertEqual(content, expected)
Exemple #2
0
    def test_multiple_nodes(self):
        nodes = [
            NodeSequence([
                Node("This is beginning ", "and end of sentence. ", 1, "p"),
                Node("Second ", "sentence.", 1, "div"),
                Node("", "Third.", 1, "p"),
            ])
        ]

        expected = "\nThis is beginning and end of sentence. Second sentence.\nThird."

        content = to_text(nodes)

        self.assertEqual(content, expected)
Exemple #3
0
    def test_two_sections(self):
        html = """
            <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
            <head>
                <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
            </head>
            <body>
                <h2>First title</h2>
                <div>
                    <p>This is first sentence.</p>
                    <p>This is second sentence.</p>
                </div>
                <h2>Second title</h2>
                <div>
                    <div>
                        <p>This is fourth sentence.</p>
                        <p>This is fifth sentence.</p>
                    </div>
                </div>
            </body>
        """

        expected = [
            NodeSequence([
                Node("This is first sentence.", "", 3, "p"),
                Node("This is second sentence.", "", 4, "p")
            ]),
            NodeSequence([
                Node("This is fourth sentence.", "", 9, "p"),
                Node("This is fifth sentence.", "", 10, "p")
            ])
        ]

        content = parse(html)
        remove_trailing_whitespaces(content)

        self.assertEqual(content, expected)
Exemple #4
0
    def test_single_p(self):
        html = """
            <body>
                <h1>Page title</h1>
                <div>
                    <p>This is first sentence.</p>
                </div>
            </body>
        """

        expected = [NodeSequence([
            Node("This is first sentence.", "", 3, "p"),
        ])]

        content = parse(html)
        remove_trailing_whitespaces(content)

        self.assertEqual(content, expected)
Exemple #5
0
    def test_two_consecutive_div(self):
        html = """
            <body>
                <h1>Page title</h1>
                <div>
                    <div>This is first sentence.</div>
                    <div>This is second sentence.</div>
                </div>
            </body>
        """

        expected = [NodeSequence([
            Node("This is first sentence.", "", 3, "div"),
            Node("This is second sentence.", "", 4, "div")
        ])]

        content = parse(html)
        remove_trailing_whitespaces(content)

        self.assertEqual(content, expected)
Exemple #6
0
    def test_links(self):
        html = """
            <body>
                <h1>Page title</h1>
                <div>
                    <p>This is first sentence.</p>
                    <p>This is second sentence. Here <a>are</a> multiple <a>links</a> to different <a>resources</a>.</p>
                </div>
            </body>
        """

        expected = [NodeSequence(nodes=[
            Node(text="This is first sentence.", tail="", visit_time=3, tag="p"),
            Node(text="This is second sentence. Here", tail="", visit_time=4, tag="p"),
            Node(text="are", tail="multiple", visit_time=5, tag="a"),
            Node(text="links", tail="to different", visit_time=6, tag="a"),
            Node(text="resources", tail=".", visit_time=7, tag="a")])
        ]

        content = parse(html)
        remove_trailing_whitespaces(content)

        self.assertEqual(content, expected)
Exemple #7
0
    def test_article_tag(self):
        html = """
            <body>
                <h1>Page title</h1>
                <div>
                    <p>This is first ignore sentence.</p>
                    <p>This is second ignored sentence.</p>
                </div>
                <article>
                    <p>This is first sentence.</p>
                    <p>This is second sentence.</p>
                </article>
            </body>
        """

        expected = [NodeSequence([
            Node("This is first sentence.", "", 2, "p"),
            Node("This is second sentence.", "", 3, "p")
        ])]

        content = parse(html)
        remove_trailing_whitespaces(content)

        self.assertEqual(content, expected)