def test_strip_control_characters_non_printing_characters():
    """Zero-width space and BOM characters are dropped by both helpers."""
    raw = "A string with non-printing characters inc\u200Bluded\ufeff"
    expected = "A string with non-printing characters included"
    assert strip_control_characters(raw) == expected
    assert normalise_text(raw) == expected
def test_node_index_assignment():
    """Whitelisted elements should get an appropriate index but bare strings should not."""
    html = """
    <div>
        <p>Some text</p>
        <p></p>
        Some bare text
    </div>
    """.strip()
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.find_all("div")[0]
    rendered = (
        normalise_text(str(add_node_indexes(child)))
        for child in container.children
    )
    non_empty = [text for text in rendered if text]
    assert non_empty == [
        '<p data-node-index="0">Some text</p>',
        '<p data-node-index="0"></p>',
        'Some bare text'
    ]
def test_strip_control_characters_tab():
    """Tabs survive stripping but are collapsed by full normalisation."""
    raw = "A string with tabs\tinc\u200Bluded\ufeff"
    assert strip_control_characters(raw) == "A string with tabs\tincluded"
    assert normalise_text(raw) == "A string with tabs included"
def test_strip_control_characters_ff():
    """Form feeds survive stripping but are collapsed by full normalisation."""
    raw = "A string with form feed\finc\u200Bluded\ufeff"
    assert strip_control_characters(raw) == "A string with form feed\fincluded"
    assert normalise_text(raw) == "A string with form feed included"
def test_strip_control_characters_cr_lf():
    """CR/LF pairs survive stripping but are collapsed by full normalisation."""
    raw = "A string with new lines\r\ninc\u200Bluded\ufeff"
    assert strip_control_characters(raw) == "A string with new lines\r\nincluded"
    assert normalise_text(raw) == "A string with new lines included"
def test_text_normalisation():
    """Decomposed accents (e + combining acute) are composed to a single character."""
    decomposed = "Ame\u0301lie Poulain"
    assert normalise_text(decomposed) == "Amélie Poulain"