コード例 #1
0
 def test_empty_title(self):
     """Split ``empty_title.html`` into its three expected flat h1 sections."""
     cover_page = {
         "title": "Cover Page",
         "header_type": "h1",
         "id": None,
         "content":
         "\n\nIntroduction\n\n    \n\nText that should be in the introduction.\n\n    ",
     }
     installation = {
         "title": "Installation",
         "header_type": "h1",
         "id": "_Ref2A4E1AB689A0D2EE52FF15610E2D8283",
         "content": "\n    a\n    \n    b\n\n    ",
     }
     removal = {
         "title": "Removal",
         "header_type": "h1",
         "id": "_Re2D8283",
         "content": "\n    c\n    \n    d\n  ",
     }
     fixture = Path(FIXTURE_DIR).joinpath("empty_title.html")
     sections = HtmlSplitter(path=fixture).split()
     self.assertEqual(sections, [cover_page, installation, removal])
コード例 #2
0
    def test_split_example(self):
        """Check the second section produced for the ``example.html`` fixture.

        The section at index 1 of the split result must be the
        "Mentions légales" h1 section, with its HTML content kept verbatim.
        """
        splitter = HtmlSplitter(
            path=Path(FIXTURE_DIR).joinpath("example.html"))
        # The content literal is whitespace-sensitive: it is compared
        # character for character with what the splitter returns.
        expected = {
            "title": "Mentions légales",
            "content": """
<blockquote>
<p><span id="_Refe94959a88d0482b88316b858f9aa3f4e" class="anchor"></span>Sous réserve de disponibilité et
    de modifications techniques.</p>
<p>Toute communication ou reproduction, diffusion et/ou modification de ce document ainsi que toute exploitation ou
    communication de son contenu sont interdites, sauf autorisation expresse. Tout manquement à cette règle est
    illicite et expose son auteur au versement de dommages et intérêts. Tous les droits sont réservés en cas de
    délivrance d’un brevet, d’enregistrement d’un modèle d’utilité ou d’un modèle de design.</p>
<p>Edité par</p>
<p>Siemens Switzerland Ltd.</p>
<p>Building Technologies Division</p>
<p>International Headquarters</p>
<p>Gubelstrasse 22</p>
<p>CH-6301 Zug</p>
<p>Tel. +41 41 724-2424</p>
<p>www.siemens.com/buildingtechnologies</p>
<p>Edition: 2015-12-15</p>
<p>ID document: 009026_h_fr_--<br />
<br />
© Siemens Switzerland Ltd, 2006</p>
</blockquote>
""",
            "header_type": "h1",
            "id": "mentions-légales",
        }
        # Only the second element of the split result is checked here.
        self.assertEqual(splitter.split()[1], expected)
コード例 #3
0
 def test_malformed(self):
     """Check the split result for the three malformed fixtures.

     ``malformed.html`` and ``malformed2.html`` must both produce the same
     tree: a cover page followed by an h3 section holding one h6 child.
     For ``malformed3.html`` the h6 markup is expected to stay inside the
     h3 content, so the h3 section has no children.
     """
     fixture_dir = Path(FIXTURE_DIR)
     cover_page = {
         "title": "Cover Page",
         "header_type": "h3",
         "content": "\nIntroduction\n\n",
         "id": None,
     }
     heading_6 = {
         "title": "Heading 6",
         "header_type": "h6",
         "content": "\nb\n",
         "id": None,
     }
     heading_3 = {
         "title": "Heading 3",
         "header_type": "h3",
         "content": "\na\n",
         "children": [heading_6],
         "id": None,
     }
     expected = [cover_page, heading_3]

     for fixture_name in ("malformed.html", "malformed2.html"):
         headers = HtmlSplitter(path=fixture_dir.joinpath(fixture_name)).split()
         self.assertEqual(headers, expected, "In {}".format(fixture_name))

     # Third fixture: same expectation, except that the h3 section swallows
     # the stray closing tag and the h6 heading as raw content.
     heading_3["content"] = (
         '\na\n</body class="page-background">\n<h6>Heading 6</h6>\nb\n')
     del heading_3["children"]
     headers = HtmlSplitter(path=fixture_dir.joinpath("malformed3.html")).split()
     self.assertEqual(headers, expected)
コード例 #4
0
 def test_simple_split_with_text(self):
     """Split ``headings_simple.html`` and check the nested h1/h2/h3 tree."""

     def section(header_type, title, content, children=None):
         # Build one expected section; "children" is only present when given.
         node = {
             "title": title,
             "header_type": header_type,
             "content": content,
             "id": None,
         }
         if children is not None:
             node["children"] = children
         return node

     heading_1_2_3 = section("h3", "Heading 1-2-3", "\nc\n")
     heading_1_2 = section("h2", "Heading 1-2", "\nb\n", [heading_1_2_3])
     heading_1_3_2 = section("h2", "Heading 1.3-2", "\nf\n")
     expected = [
         section("h1", "Cover Page", "\nz\n"),
         section("h1", "Heading 1", "\na\n", [heading_1_2]),
         section("h1", "Heading 1.2", "\nd\n"),
         section("h1", "Heading 1.3", "\ne\n", [heading_1_3_2]),
     ]

     fixture = Path(FIXTURE_DIR).joinpath("headings_simple.html")
     self.assertEqual(HtmlSplitter(path=fixture).split(), expected)
コード例 #5
0
 def test_simple_headings(self):
     """Split ``standard_headings.html``: two h1 sections with two h2 each."""

     def section(header_type, title, content, children=None):
         # Build one expected section; "children" is only present when given.
         node = {
             "title": title,
             "header_type": header_type,
             "content": content,
             "id": None,
         }
         if children is not None:
             node["children"] = children
         return node

     heading_1a = section(
         "h1", "Heading 1a", "\n\nIntroduction text 1a\n\n",
         [
             section("h2", "Heading 1a-2a", "\n\nParagraph text 1a-2a\n\n"),
             section("h2", "Heading 1a-2b", "\n\nParagraph text 1a-2b\n\n"),
         ])
     heading_1b = section(
         "h1", "Heading 1b", "\n\nIntroduction text 1b\n\n",
         [
             section("h2", "Heading 1b-2a", "\n\nParagraph text 1b-2a\n\n"),
             section("h2", "Heading 1b-2b", "\n\nParagraph text 1b-2b\n\n"),
         ])
     cover_page = section("h1", "Cover Page",
                          "\n\nIntroduction text document.\n\n")

     fixture = Path(FIXTURE_DIR).joinpath("standard_headings.html")
     self.assertEqual(HtmlSplitter(path=fixture).split(),
                      [cover_page, heading_1a, heading_1b])
コード例 #6
0
 def test_disgusting_mammoth_output(self):
     """Headings stuffed with empty ``<a id=...>`` anchors are cleaned up.

     Each expected title is the heading text without the anchors, and each
     expected id is the id of the first anchor found in that heading.
     """
     # Markup placed inside the <h1> tag.
     h1_markup = (
         '<a id="_Toc126736819"></a><a id="_Toc127339767"></a>'
         '<a id="_Toc315192141"></a><a id="_Toc424140849"></a>'
         'Analoge Ausgänge')
     # Markup placed inside the <h2> tag.
     h2_markup = (
         '<a id="_Toc126736820"></a><a id="_Toc127339768"></a>'
         '<a id="_Toc315192142"></a><a id="_Toc424140850"></a>'
         'Stellsignal stetig (AO 0-10V)')

     nested_h2 = {
         "title": "Stellsignal stetig (AO 0-10V)",
         "header_type": "h2",
         "content": "",
         "id": "_Toc126736820",
     }
     top_h1 = {
         "title": "Analoge Ausgänge",
         "header_type": "h1",
         "content": "",
         "children": [nested_h2],
         "id": "_Toc126736819",
     }

     document = "<h1>%s</h1><h2>%s</h2>" % (h1_markup, h2_markup)
     headers = HtmlSplitter(document).split()
     self.assertEqual(headers, [top_h1])
コード例 #7
0
    def test_anchor(self):
        headers = HtmlSplitter(
            path=Path(FIXTURE_DIR).joinpath("anchor.html")).split()
        expected = [
            {
                "content": "\n\nIntroduction\n\n",
                "header_type": "h1",
                "title": "Cover Page",
                "id": None
            },
            {
                "content": '\n\nHeading 1\n\n<a href="heading2"></a>\n\n',
                "header_type": "h1",
                "title": "Heading 1",
                "id": "heading1",
            },
            {
                "content": """

Heading 2

<a href="heading1"></a>

<a href="https://google.com/#Heading2">Clique</a>

""",
                "header_type": "h1",
                "title": "Heading 2",
                "id": "heading2",
            },
        ]
        self.assertEqual(headers, expected)
コード例 #8
0
 def test_resource_creation(self):
     """Convert ``images.html`` and check the resource and topic counts."""
     fixture = Path(FIXTURE_DIR).joinpath("images.html")
     converter = HtmlToTopics(HtmlSplitter(path=fixture),
                              render_cover_page=True)
     topics = converter.topics
     self.assertEqual(len(converter.resources), 6)
     self.assertEqual(len(topics), 1)
     root_children = topics[0].children
     self.assertEqual(len(root_children), 12)
     # The expected title keeps its trailing space.
     self.assertEqual(root_children[1].title, "The alt Attribute ")
コード例 #9
0
 def test_simple_split(self):
     """Split ``heading.html`` into a cover page plus one h1 section."""
     cover_page = {
         "title": "Cover Page",
         "header_type": "h1",
         "content": "\n\nIntroduction\n\n",
         "id": None,
     }
     first_heading = {
         "title": "My First Heading",
         "header_type": "h1",
         "content": "\n\n<p>My first paragraph.</p>\n\n",
         "id": None,
     }
     fixture = Path(FIXTURE_DIR).joinpath("heading.html")
     sections = HtmlSplitter(path=fixture).split()
     self.assertEqual(sections, [cover_page, first_heading])
コード例 #10
0
def ft_content_from_html_content(html_content, title, use_ftml,
                                 render_cover_page, path):
    """Build a ``StructuredContent`` and its resources from HTML.

    Two conversion routes exist:

    * ``use_ftml`` truthy: a single topic is built from ``title`` and
      ``html_content``, split with ``TopicsSplitter`` and converted with
      ``PublicationConverter``. ``path`` and ``render_cover_page`` are not
      used on this route, and the returned resource list is empty.
    * otherwise: an ``HtmlSplitter`` is created from ``path`` when it is
      truthy, else from ``html_content``, and handed to ``HtmlToTopics``.
      ``title`` is not used on this route.

    Returns:
        A ``(content, resources)`` tuple.

    Raises:
        ModuleNotFoundError: if ``use_ftml`` is requested while
            ``FTML_AVAILABLE`` is falsy.
    """
    if not use_ftml:
        if path:
            splitter = HtmlSplitter(path=path)
        else:
            splitter = HtmlSplitter(content=html_content)
        converter = HtmlToTopics(splitter,
                                 render_cover_page=render_cover_page)
        structured = StructuredContent(toc=converter.topics,
                                       editorial_type=EditorialType.DEFAULT)
        return structured, converter.resources

    if not FTML_AVAILABLE:
        raise ModuleNotFoundError(
            "Please install the FTML connector in order to use FTML.")

    builder = TopicBuilder().title(Metadata.title(title))
    topic = builder.content(html_content).origin_id("0").build()
    split_topics = [TopicsSplitter().split(topic)]
    toc_nodes = PublicationConverter().convert_toc(split_topics)
    structured = StructuredContent(toc=toc_nodes,
                                   editorial_type=EditorialType.DEFAULT)
    return structured, []
コード例 #11
0
    def test_heading(self):
        splitter = HtmlSplitter(path=Path(FIXTURE_DIR).joinpath("heading.html"))
        html2topics = HtmlToTopics(splitter, render_cover_page=True)
        expected = [
            NeoTopic(title="Cover Page", content="\n\nIntroduction\n\n"),
            NeoTopic(
                title="My First Heading",
                content="""

<p>My first paragraph.</p>

""",
            ),
        ]
        self.assertEqual(len(html2topics.resources), 0)
        self.assertEqual(html2topics.topics, expected)
コード例 #12
0
 def test_anchor_multiple_id(self):
     """Split ``anchor_multiple_id.html`` and check ids, titles and links.

     Each heading section is expected to contain the same link three times,
     and the expected titles keep their trailing whitespace.
     """
     link_1 = '<a href="heading1.1"></a>\n'
     link_2 = '<a href="heading2.1"></a>\n'

     cover_page = {
         "title": "Cover Page",
         "header_type": "h1",
         "content": "\n\nIntroduction\n\n",
         "id": None,
     }
     heading_1 = {
         "title": "Heading 1\n    \n\n",
         "header_type": "h1",
         # Three identical links, followed by two blank lines.
         "content": "\n\nHeading 1\n\n" + link_1 * 3 + "\n\n",
         "id": "heading1.1",
     }
     heading_2 = {
         "title": "Heading 2\n    \n\n",
         "header_type": "h1",
         # Three identical links, followed by a single blank line.
         "content": "\n\nHeading 2\n\n" + link_2 * 3 + "\n",
         "id": "heading2.1",
     }

     fixture = Path(FIXTURE_DIR).joinpath("anchor_multiple_id.html")
     headers = HtmlSplitter(path=fixture).split()
     self.assertEqual(headers, [cover_page, heading_1, heading_2])
コード例 #13
0
 def test_real_world_example(self):
     """Split the ``iphone5repare.html`` fixture and check its overall shape.

     Checks, in order: the number of top-level h1 sections, the number of
     h2 sections under the last h1, the number and titles of the children
     of those h2 sections, and finally the last h1 section itself.
     """
     headers = HtmlSplitter(
         path=Path(FIXTURE_DIR).joinpath("iphone5repare.html")).split()
     h1s = [h for h in headers if h["header_type"] == "h1"]
     self.assertEqual(len(h1s), 2)

     # The section under inspection is the last top-level h1. Taking it from
     # the filtered list replaces a loop that kept the last match under the
     # misleading name ``first_h1``.
     last_h1 = h1s[-1]

     # Only genuine h2 children are counted; other header levels may be
     # attached directly to the h1 as well.
     h2s = [
         child for child in last_h1["children"]
         if child["header_type"] == "h2"
     ]
     self.assertEqual(len(h2s), 31)

     # Every child of an h2 section, whatever its header level.
     h3s = []
     for h2 in h2s:
         h3s += h2.get("children", [])
     self.assertEqual(len(h3s), 44)
     self.assertEqual(
         [h["title"] for h in h3s],
         ["Outils", "Pièces"] + ["Ajouter un commentaire"] * 10 +
         ["Kits pour Nintendo Switch\n"] * 2 +
         ["Ajouter un commentaire"] * 19 + [
             "+13                  ",
             "Auteur\n",
             "Équipe",
             "209 commentaires         ",
             "Intégrer ce tutoriel",
             "Aperçu",
             "iFixit",
             "Boutiques",
             "Réparabilité",
             "Plaidoyer",
             "Restez au courant",
         ],
     )

     self.assertEqual(last_h1.get("title"),
                      "Comment remplacer la batterie de l'iPhone 5s")
     self.assertEqual(last_h1.get("header_type"), "h1")
     children_of_last_h1 = last_h1.get("children")
     self.assertEqual(len(children_of_last_h1), 32)
     self.assertEqual(children_of_last_h1[4].get("title"), "\nÉtape 2\n\n")
コード例 #14
0
 def test_init(self):
     """Exercise constructor argument validation and the empty-content case."""
     # A single positional argument must construct without raising.
     HtmlSplitter(
         '<a href="http://example.com/?foo=val1&bar=val2">A link</a>')

     # Two positional arguments are rejected.
     with self.assertRaises(ValueError) as both_given:
         HtmlSplitter("<div>/div>", "/tmp/path/to_file.html")
     self.assertEqual(str(both_given.exception),
                      "Choose only one between <content> and <path>.")

     # No argument at all is rejected as well.
     with self.assertRaises(ValueError) as nothing_given:
         HtmlSplitter()
     self.assertEqual(str(nothing_given.exception),
                      "Choose at least one between <content> and <path>.")

     # A path that does not exist surfaces as FileNotFoundError.
     with self.assertRaises(FileNotFoundError) as missing_file:
         HtmlSplitter(path="this/path/does/not/exists.html")
     self.assertEqual(
         str(missing_file.exception),
         "[Errno 2] No such file or directory: 'this/path/does/not/exists.html'"
     )

     # Constructing from an existing fixture populates ``content``.
     from_fixture = HtmlSplitter(
         path=Path(FIXTURE_DIR).joinpath("example.html"))
     self.assertIsNotNone(from_fixture.content)

     # Empty content splits into an empty list.
     from_empty = HtmlSplitter(content="")
     self.assertEqual(from_empty.split(), [])
コード例 #15
0
 def test_empty_title(self):
     """Convert ``empty_title.html`` into three topics and no resources."""
     cover_page = NeoTopic(
         title="Cover Page",
         origin_id=None,
         content="\n\nIntroduction\n\n    \nText that should be in the introduction.\n\n    ",
     )
     installation = NeoTopic(
         title="Installation",
         origin_id="_Ref2A4E1AB689A0D2EE52FF15610E2D8283",
         content="\n    a\n    \n    b\n\n    ",
     )
     removal = NeoTopic(
         title="Removal",
         origin_id="_Re2D8283",
         content="\n    c\n    \n    d\n  ",
     )
     fixture = Path(FIXTURE_DIR).joinpath("empty_title.html")
     converter = HtmlToTopics(HtmlSplitter(path=fixture), render_cover_page=True)
     self.assertEqual(len(converter.resources), 0)
     self.assertEqual(converter.topics, [cover_page, installation, removal])
コード例 #16
0
 def test_heading_three_level(self):
     """Convert ``heading_three_levels.html`` into a three-level topic tree."""
     # Build the expected tree from the innermost topic outwards.
     level_3 = NeoTopic(title="Heading 1-2-3", content="\nc\n")
     level_2 = NeoTopic(title="Heading 1-2", content="\nb\n", children=[level_3])
     level_1 = NeoTopic(title="Heading 1", content="\na\n", children=[level_2])
     cover_page = NeoTopic(title="Cover Page", content="\n\nIntroduction\n\n")
     expected = [cover_page, level_1]

     fixture = Path(FIXTURE_DIR).joinpath("heading_three_levels.html")
     converter = HtmlToTopics(HtmlSplitter(path=fixture), render_cover_page=True)
     self.assertEqual(len(converter.resources), 0)
     self.assertEqual(expected, converter.topics)
コード例 #17
0
    def test_cover_page(self):
        splitter = HtmlSplitter(
            content="""<!DOCTYPE html>
<html>
<body>
a
<h1>b</h1>
c
</body>
</html>
"""
        )
        html2topics = HtmlToTopics(splitter, render_cover_page=True)
        self.assertEqual(html2topics.resources, [])
        expected = [NeoTopic(title="Cover Page", content="a\n"), NeoTopic(title="b", content="c\n")]
        self.assertEqual(html2topics.topics, expected)
        html2topics = HtmlToTopics(splitter, render_cover_page=False)
        expected = expected[1:]
        self.assertEqual(html2topics.resources, [])
        self.assertEqual(html2topics.topics, expected)
コード例 #18
0
    def test_anchor(self):
        """Convert ``anchor.html`` into three topics and no resources.

        The heading ids are expected to end up in ``origin_id`` and the
        links to stay in the topic content.
        """
        splitter = HtmlSplitter(path=Path(FIXTURE_DIR).joinpath("anchor.html"))
        html2topics = HtmlToTopics(splitter, render_cover_page=True)
        expected = [
            NeoTopic(title="Cover Page", origin_id=None, content="\n\nIntroduction\n\n"),
            NeoTopic(title="Heading 1", origin_id="heading1", content='\n\nHeading 1\n\n<a href="heading2"></a>\n\n'),
            NeoTopic(
                title="Heading 2",
                origin_id="heading2",
                # NOTE(review): the lines of this literal carry 8 leading
                # spaces, which looks like the result of re-indenting the
                # source. Presumably NeoTopic equality is tolerant of
                # whitespace differences - confirm before relying on it.
                content="""
        Heading 2

        <a href="heading1"></a>

        <a href="https://google.com/#Heading2">Clique</a>

        """,
            ),
        ]
        self.assertEqual(len(html2topics.resources), 0)
        self.assertEqual(html2topics.topics, expected)
コード例 #19
0
    def test_empty_without_cover_page(self):
        splitter = HtmlSplitter(
            content="""<!DOCTYPE html>
<html>
<body>
a
b
c
</body>
</html>
"""
        )
        html2topics = HtmlToTopics(splitter, render_cover_page=True)
        self.assertEqual(html2topics.resources, [])
        expected_content = "a\nb\nc\n"
        expected = [NeoTopic(title="Cover Page", content=expected_content)]
        self.assertEqual(html2topics.topics, expected)
        html2topics = HtmlToTopics(splitter, render_cover_page=False)
        expected = [NeoTopic(title="Flat document", content=expected_content)]
        self.assertEqual(html2topics.resources, [])
        self.assertEqual(html2topics.topics, expected)
コード例 #20
0
    def test_real_world_example(self):
        """Convert the ``iphone5repare.html`` fixture and spot-check the result.

        Checks the number of resources and top-level topics, the title and
        child count of the second topic, and the exact ``repr`` of one
        child of the first topic.
        """
        splitter = HtmlSplitter(path=Path(FIXTURE_DIR).joinpath("iphone5repare.html"))
        html2topics = HtmlToTopics(splitter, render_cover_page=True)
        self.assertEqual(len(html2topics.resources), 14)
        topics = html2topics.topics
        self.assertEqual(len(topics), 2)
        self.assertEqual(topics[1].title, "Comment remplacer la batterie de l'iPhone 5s")
        self.assertEqual(len(topics[1].children), 32)
        # The "Outils" topic is looked up by position under the first topic.
        tool = topics[0].children[19]
        self.assertEqual(tool.title, "Outils")
        # The repr is compared character for character, indentation included.
        self.assertEqual(
            repr(tool),
            """<Topic>
    Title:
        Outils
    Content:
        <div><div class="sc-lkqHmb fRhhEx"></div></div>
    Children:
        []
</Topic>
""",
        )
コード例 #21
0
 def test_convoluted(self):
     """Split ``convoluted_javascripted.html`` and check the resulting tree.

     The first h1 section is expected to hold an h4 child followed by an h3
     child; the second h1 section has no children.
     """
     statement = {
         "title": "What is wrong... ",
         "header_type": "h4",
         "content": "\n\nThen they format it properly...\n\n",
         "id": "statement",
     }
     semantic_html = {
         "title": "...with semantic HTML ?",
         "header_type": "h3",
         "content": "\n\n... with javascript and CSS.\n\n",
         "id": None,
     }
     problem = {
         "title": "Problem description",
         "header_type": "h1",
         "content":
         "\n\nThe problem is that poeple create shitty base HTML.\n\n",
         "children": [statement, semantic_html],
         "id": "firstone",
     }
     result = {
         "title": "Result",
         "header_type": "h1",
         "content": "\n\nWell shit goes in, shit comes out !\n\n",
         "id": None,
     }

     fixture = Path(FIXTURE_DIR).joinpath("convoluted_javascripted.html")
     headers = HtmlSplitter(path=fixture).split()
     self.assertEqual(headers, [problem, result])
コード例 #22
0
 def test_headings(self):
     """Convert ``headings_simple.html`` and compare the topics one by one.

     Topics are compared individually so that a failure message shows the
     offending topic next to its expectation.
     """
     splitter = HtmlSplitter(path=Path(FIXTURE_DIR).joinpath("headings_simple.html"))
     html2topics = HtmlToTopics(splitter, render_cover_page=True)
     expected = [
         NeoTopic(title="Cover Page", content="z"),
         NeoTopic(
             title="Heading 1",
             content="\na\n",
             children=[
                 NeoTopic(
                     title="Heading 1-2", content="\nb\n", children=[NeoTopic(title="Heading 1-2-3", content="c")]
                 )
             ],
         ),
         NeoTopic(title="Heading 1.2", content="\nd\n"),
         NeoTopic(title="Heading 1.3", content="\ne\n", children=[NeoTopic(title="Heading 1.3-2", content="\nf\n")]),
     ]
     self.assertEqual(len(html2topics.resources), 0)
     topics = html2topics.topics
     # Guard against a vacuous pass: without this check, a result with fewer
     # topics than expected (even none at all) would go unnoticed because
     # the loop below only visits the topics that were actually produced.
     self.assertEqual(len(topics), len(expected))
     for wanted, part in zip(expected, topics):
         self.assertEqual(
             wanted,
             part,
             "What we have:\n{}\n\n{}\nWhat we expect:\n{}\n\n".format(part, "-" * 80, wanted),
         )
コード例 #23
0
 def test_empty_content(self):
     """Empty content yields no topics and no resources, cover page or not."""
     empty_splitter = HtmlSplitter(content="")
     for with_cover_page in (True, False):
         converter = HtmlToTopics(empty_splitter, render_cover_page=with_cover_page)
         self.assertEqual(converter.topics, [])
         self.assertEqual(converter.resources, [])
コード例 #24
0
 def test_table_split(self):
     """Split ``tables.html`` and compare with ``EXPECTED_TABLES_HEADER``."""
     fixture = Path(FIXTURE_DIR).joinpath("tables.html")
     headers = HtmlSplitter(path=fixture).split()
     self.assertEqual(headers, EXPECTED_TABLES_HEADER)