def test_sanitize_post_handles_tag_case_mismatch():
    """Regression test: sanitizing a post with mismatched tag case must finish.

    A previous version of sanitize_post froze on this input. The suspected
    cause (not confirmed) was exponential regex backtracking triggered by the
    ``<pre> ... </prE>`` pair, whose opening and closing tags differ in case.

    The test makes no assertion on the result; it passes as long as the call
    returns.
    """
    # The pieces holding Windows-style paths are raw literals so that
    # sequences such as ``\s`` and ``\j`` are not parsed as (invalid) escape
    # sequences. The resulting string value is the same as with plain literals.
    text = (
        '''<p><em>"I didn't like this because I have only two C files and it '''
        '''seemed very odd to split the source base at the language level '''
        '''like this"</em></p> '''
        '<p>Why does it seem odd? Consider this project:</p> '
        r'<pre> project1\src\java project1\src\cpp project1\src\python </pre> '
        '<p>Or, if you decide to split things up into modules:</p> '
        r'<p><pre> project1\module1\src\java project1\module1\src\cpp '
        r'project1\module2\src\java project1\module2\src\python </prE></p> '
        "<p>I guess it's a matter of personal taste, but the above structure "
        "is fairly common, and I think it works quite well once you get used "
        "to it.</p>"
    )

    util.sanitize_post(text)
def test_sanitize_post_removes_linefeeds():
    """No line-break character may survive sanitization.

    Checks line feed, carriage return, and the Unicode line and paragraph
    separators (U+2028, U+2029).
    """
    raw_post = (
        "This is a text with \r\n some \u2028 nbbbb \u2029 random \n "
        "linefeeds \r and carriege returns \r\n hello \n"
    )

    cleaned = util.sanitize_post(raw_post)

    for line_break in ('\n', '\r', '\u2028', '\u2029'):
        assert line_break not in cleaned
def test_sanitize_post_replaces_all_whitespace_with_single_spaces():
    """Every run of whitespace in a sanitized post must be one single space."""
    sanitized = util.sanitize_post(
        post_base_text.format(code_segment, pre_segment, blockquote_segment))

    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    whitespace_runs = re.findall(r'\s+', sanitized)

    # Meta assert: if no whitespace was found at all, the per-run check
    # below would pass vacuously.
    assert whitespace_runs
    for run in whitespace_runs:
        assert run == ' '
def test_sanitize_post_md_code_pattern_is_not_greedy():
    """Markdown code spans must be removed one by one, not as one big match.

    A greedy pattern would match from the first backtick to the last one and
    swallow the prose in between.
    """
    fragments = [
        "`this is code` but a greedy```other code``` pattern\nwould remove",
        "`this whole post`",
        "```along with``` this as well```hehe```",
    ]

    cleaned = util.sanitize_post("".join(fragments))

    assert cleaned == "but a greedy pattern would remove this as well"
def test_sanitize_post_removes_url():
    """Both http and https URLs must be gone from the sanitized post."""
    urls = (
        "https://hello.world#aweseaf45we23.com",
        "http://blabla.com#badonk",
    )
    template = "{} and other stuff {} awesome donk {}\n\nhurrdurr"

    cleaned = util.sanitize_post(template.format(comment, *urls))

    for url in urls:
        assert url not in cleaned
def test_sanitize_post_removes_triple_backtick_code():
    """A triple-backtick code block must be stripped from the post."""
    fenced_code = '```for i in range(10):\n print(i)```'
    template = "{} blablabla bla 234 d23r23 {}\nAnd just the finishing touch."

    cleaned = util.sanitize_post(template.format(comment, fenced_code))

    # The whole block, any stray backtick, and pieces of the code itself
    # must all be absent.
    for fragment in (fenced_code, '`', 'for i in range', 'range(10)'):
        assert fragment not in cleaned
def _post_xml_row_to_model(elem,
                           question_ids: Set[int] = None,
                           target_post_type: PostType = PostType.QUESTION):
    """Build a Post model from one row element of Posts.xml.

    The row's Body is passed through sanitize_post before the model is
    created.

    Args:
        elem: The xml row element; its ``attrib`` mapping is read.
        question_ids: Ids of accepted questions. Only consulted when
            target_post_type is PostType.ANSWER, in which case an answer is
            kept only if its ParentId is in this collection.
            NOTE(review): the default of None is not usable together with
            PostType.ANSWER (the membership test would fail) -- confirm that
            callers always pass it in that case.
        target_post_type: The only post type that is converted.

    Returns:
        A Post, or None when the row is skipped: its PostTypeId is not a
        valid PostType, its type differs from target_post_type, it is an
        answer whose parent is not in question_ids, or sanitization raised
        ValueError (which is logged).
    """
    attrs = elem.attrib

    try:
        post_type = PostType(int(attrs['PostTypeId']))
    except ValueError:
        # neither a question nor an answer
        return None

    # rows that should not be converted
    if target_post_type != post_type:
        return None
    wants_answer = target_post_type == PostType.ANSWER
    if wants_answer and int(attrs['ParentId']) not in question_ids:
        return None

    try:
        sanitized = sanitize_post(attrs['Body'])
    except ValueError:
        LOGGER.error(
            f"Sanitization failed for Post with Id={attrs['Id']}")
        return None

    date = MayaDT.from_rfc3339(attrs['CreationDate']).date

    # Answers carry a parent reference but no title or tags; questions are
    # the other way around.
    is_answer = post_type == PostType.ANSWER
    title = None if is_answer else attrs['Title']
    tags = None if is_answer else attrs['Tags']
    parent_id = attrs['ParentId'] if is_answer else None

    return Post(id=attrs['Id'],
                creation_date=date,
                post_type_id=post_type.value,
                title=title,
                text=sanitized,
                tags=tags,
                parent_id=parent_id)
def test_sanitize_real_post():
    """Sanitize a real SO answer (authored by Simon Larsén) and compare the
    result against the exact expected text.
    """
    raw_html = (
        "<p>You can do this in just two lines.</p> "
        "<pre><code>with open('path/to/file') as f: "
        "line_lists = [list(line.strip()) for line in f] </code></pre> "
        "<p><code>list</code> on a <code>str</code> object will return a list "
        "where each character is an element (as a <code>char</code>). "
        "<code>line</code> is stripped first, which removes leading and "
        "trailing whitespace. This is assuming that you actually want the "
        "characters as <code>char</code>. If you want them parsed to "
        "<code>int</code>, this will work:</p> "
        "<pre><code>with open('path/to/file') as f: "
        "line_lists = [[int(x) for x in line.strip()] for line in f] "
        "</code></pre> "
        "<p>Mind you that there should be some error checking here, the above "
        "example will crash if any of the characters cannot be parsed to "
        "int.</p> "
    )
    # Code segments (inline and block) are expected to vanish entirely,
    # which leaves gaps such as "(as a )" in the prose.
    wanted = (
        "You can do this in just two lines. on a object will return a list "
        "where each character is an element (as a ). is stripped first, which "
        "removes leading and trailing whitespace. This is assuming that you "
        "actually want the characters as . If you want them parsed to , this "
        "will work: Mind you that there should be some error checking here, "
        "the above example will crash if any of the characters cannot be "
        "parsed to int."
    )

    assert util.sanitize_post(raw_html) == wanted
def test_fill_database():
    """Fill the database from the test XML files and verify every row.

    Each post and comment row in the XML files is looked up by Id and its
    stored fields are compared with values derived from the XML attributes.
    """

    def rows_of(xml_path):
        # Only <row> elements carry data; other parsed elements are skipped.
        return (node for _, node in ElementTree.iterparse(xml_path)
                if node.tag == 'row')

    migrate_data.fill_database(questions_xml=TEST_POSTS_XML,
                               answers_xml=TEST_POSTS_XML,
                               comments_xml=TEST_COMMENTS_XML)
    session = database.Session()

    # check posts
    for row in rows_of(TEST_POSTS_XML):
        attrs = row.attrib
        stored_post = session.query(database.Post).get(int(attrs['Id']))
        assert stored_post.text == util.sanitize_post(attrs['Body'])
        assert stored_post.post_type_id == int(attrs['PostTypeId'])
        if stored_post.post_type_id == database.PostType.ANSWER.value:
            assert stored_post.parent_id == int(attrs['ParentId'])

    # check comments
    for row in rows_of(TEST_COMMENTS_XML):
        attrs = row.attrib
        stored_comment = session.query(database.Comment).get(
            int(attrs['Id']))
        assert stored_comment.text == util.sanitize_comment(attrs['Text'])
        assert stored_comment.post_id == int(attrs['PostId'])
def test_sanitize_post_removes_blockquote_segments():
    """A blockquote segment placed in the post must be removed.

    The newline in the second slot is expected to come out as a single space.
    """
    with_blockquote = post_base_text.format(blockquote_segment, "\n", "")
    without_blockquote = post_base_text.format("", " ", "")

    assert util.sanitize_post(with_blockquote) == without_blockquote
def test_sanitize_post_removes_code_pre_and_tags():
    """A tag, a code segment and a pre segment must all be stripped."""
    dirty = post_base_text.format(
        "</a href=https://url.com>", code_segment, pre_segment)
    clean = post_base_text.format("", "", "")

    assert util.sanitize_post(dirty) == clean
def test_sanitize_post_removes_pre_segments():
    """A pre segment surrounded by newlines must be removed from the post."""
    dirty = post_base_text.format("\n", pre_segment, "\n")
    # Of the two newlines, only one single space is expected to remain in
    # the output (in the first slot).
    clean = post_base_text.format(" ", "", "")

    assert util.sanitize_post(dirty) == clean