Example #1
0
def test_strip_control_characters_non_printing_characters():
    """Both helpers should return the sentence without its invisible characters."""
    # NOTE: besides the escaped \u200B and \ufeff, the literal appears to carry
    # a raw invisible character between "in" and "c"; keep it when editing.
    raw = "A string with non-printing characters in​c\u200Bluded\ufeff"
    expected = "A string with non-printing characters included"

    # The same clean sentence is expected from both functions for this input.
    assert strip_control_characters(raw) == expected
    assert normalise_text(raw) == expected
def test_node_index_assignment():
    """Check add_node_indexes output for every direct child of a <div>.

    Per the expectation below, each <p> comes back carrying a
    data-node-index attribute while the bare text comes back without one.
    """
    html = """
        <div>
            <p>Some text</p>
            <p></p>
            Some bare text
        </div>
    """.strip()
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find_all("div")[0]

    rendered = []
    for child in container.children:
        text = normalise_text(str(add_node_indexes(child)))
        # Skip any child whose normalised form is an empty string.
        if text:
            rendered.append(text)

    expected = [
        '<p data-node-index="0">Some text</p>',
        '<p data-node-index="0"></p>',
        'Some bare text',
    ]
    assert rendered == expected
Example #3
0
def test_strip_control_characters_tab():
    """A tab survives strip_control_characters but becomes a space in normalise_text."""
    # NOTE: the literal appears to carry a raw invisible character between
    # "in" and "c" in addition to the escaped ones; keep it when editing.
    raw = "A string with tabs\tin​c\u200Bluded\ufeff"

    stripped = strip_control_characters(raw)
    assert stripped == "A string with tabs\tincluded"

    normalised = normalise_text(raw)
    assert normalised == "A string with tabs included"
Example #4
0
def test_strip_control_characters_ff():
    """A form feed survives strip_control_characters but becomes a space in normalise_text."""
    # NOTE: the literal appears to carry a raw invisible character between
    # "in" and "c" in addition to the escaped ones; keep it when editing.
    raw = "A string with form feed\fin​c\u200Bluded\ufeff"

    stripped = strip_control_characters(raw)
    assert stripped == "A string with form feed\fincluded"

    normalised = normalise_text(raw)
    assert normalised == "A string with form feed included"
Example #5
0
def test_strip_control_characters_cr_lf():
    """A CR LF pair survives strip_control_characters but becomes a space in normalise_text."""
    # NOTE: the literal appears to carry a raw invisible character between
    # "in" and "c" in addition to the escaped ones; keep it when editing.
    raw = "A string with new lines\r\nin​c\u200Bluded\ufeff"

    stripped = strip_control_characters(raw)
    assert stripped == "A string with new lines\r\nincluded"

    normalised = normalise_text(raw)
    assert normalised == "A string with new lines included"
Example #6
0
def test_text_normalisation():
    """normalise_text should turn the raw name into its single-spaced form."""
    # The input writes the accented letter as "e" followed by U+0301 and
    # separates the two words with three spaces.
    raw = "Ame\u0301lie   Poulain"
    expected = "Amélie Poulain"

    assert normalise_text(raw) == expected