Example #1
0
def test_sanitize_post_handles_tag_case_mismatch():
    """Regression test: sanitize_post must terminate on mismatched tag case.

    A previous version of sanitize_post froze on this input. The suspected
    cause was exponential backtracking triggered by the ``<pre> ... </prE>``
    pair, whose opening and closing tags differ in case. The test makes no
    assertion; it passes as long as the call returns.
    """
    # Raw string: the Windows-style paths below contain backslashes that are
    # invalid escape sequences in a regular (non-raw) string literal.
    text = r'''<p><em>"I didn't like this because I have only two C files and it seemed very odd to split the source base at the language level like this"</em></p>

<p>Why does it seem odd? Consider this project:</p>

<pre>
  project1\src\java
  project1\src\cpp
  project1\src\python
</pre>

<p>Or, if you decide to split things up into modules:</p>

<p><pre>
  project1\module1\src\java
  project1\module1\src\cpp
  project1\module2\src\java
  project1\module2\src\python
</prE></p>

<p>I guess it's a matter of personal taste, but the above structure is fairly common, and I think it works quite well once you get used to it.</p>'''
    util.sanitize_post(text)
Example #2
0
def test_sanitize_post_removes_linefeeds():
    """Sanitized text must contain no newline, carriage return, or Unicode
    line/paragraph separator characters.
    """
    text = "This is a text with \r\n some \u2028 nbbbb \u2029 random \n linefeeds \r and carriege returns \r\n hello \n"
    sanitized = util.sanitize_post(text)
    for linefeed in ('\n', '\r', '\u2028', '\u2029'):
        assert linefeed not in sanitized
Example #3
0
def test_sanitize_post_replaces_all_whitespace_with_single_spaces():
    """Every run of whitespace in a sanitized post must be exactly one space."""
    sanitized = util.sanitize_post(
        post_base_text.format(code_segment, pre_segment, blockquote_segment))
    # Raw string so the backslash reaches the regex engine as written; in a
    # plain literal the same pattern is an invalid escape sequence.
    whitespace_runs = re.findall(r'\s+', sanitized)
    for run in whitespace_runs:
        assert run == ' '
    # meta assert: the loop above is vacuous if no whitespace was found
    assert whitespace_runs
Example #4
0
def test_sanitize_post_md_code_pattern_is_not_greedy():
    """Markdown code spans must be removed one at a time, leaving the prose
    between them intact rather than swallowing everything between the first
    and last backtick.
    """
    segments = [
        "`this is code` but a greedy```other code``` pattern\nwould remove",
        "`this whole post`",
        "```along with``` this as well```hehe```",
    ]
    sanitized = util.sanitize_post("".join(segments))
    assert sanitized == "but a greedy pattern would remove this as well"
Example #5
0
def test_sanitize_post_removes_url():
    """Neither https nor http URLs may survive sanitization."""
    urls = (
        "https://hello.world#aweseaf45we23.com",
        "http://blabla.com#badonk",
    )
    post = "{} and other stuff {} awesome donk {}\n\nhurrdurr".format(
        comment, *urls)

    sanitized = util.sanitize_post(post)

    for url in urls:
        assert url not in sanitized
Example #6
0
def test_sanitize_post_removes_triple_backtick_code():
    """A triple-backtick code block embedded in a post must be removed."""
    fenced_code = '```for i in range(10):\n    print(i)```'
    post = "{} blablabla bla 234 d23r23 {}\nAnd just the finishing touch.".format(
        comment, fenced_code)

    sanitized = util.sanitize_post(post)

    # The whole block, any stray backtick, and some subpatterns of the code
    # must all be absent from the result.
    for fragment in (fenced_code, '`', 'for i in range', 'range(10)'):
        assert fragment not in sanitized
Example #7
0
def _post_xml_row_to_model(elem,
                           question_ids: Set[int] = None,
                           target_post_type: PostType = PostType.QUESTION):
    """Convert an xml row from the Posts.xml file to a model. Text is sanitized
    before conversion.

    question_ids is only applicable if the target post type is
    PostType.ANSWER. An answer is only added if its parent_id is
    contained in question_ids.

    Args:
        elem: An XML element whose ``attrib`` mapping holds the row's fields
            (PostTypeId, Id, Body, CreationDate, and ParentId for answers or
            Title and Tags for questions).
        question_ids: Ids of the questions whose answers should be kept.
            Must be provided when target_post_type is PostType.ANSWER.
        target_post_type: The only post type that is converted; rows of any
            other type are skipped.

    Returns:
        A Post model, or None if the row is skipped: its PostTypeId is not a
        valid PostType, it is not of the target type, it is an answer whose
        parent is not in question_ids, or sanitization of its body failed.

    Raises:
        TypeError: If an answer row is reached while question_ids is None.
    """
    attrib = elem.attrib
    try:
        post_type = PostType(int(attrib['PostTypeId']))
    except ValueError:  # was not a question or answer
        return None

    # early returns
    if target_post_type != post_type:
        return None
    if target_post_type == PostType.ANSWER:
        # Without this check the membership test below fails with an opaque
        # "argument of type 'NoneType' is not iterable" TypeError.
        if question_ids is None:
            raise TypeError(
                "question_ids is required when target_post_type is "
                "PostType.ANSWER")
        if int(attrib['ParentId']) not in question_ids:
            return None
    try:
        sanitized = sanitize_post(attrib['Body'])
    except ValueError:
        LOGGER.error(
            f"Sanitization failed for Post with Id={elem.attrib['Id']}")
        return None

    date = MayaDT.from_rfc3339(attrib['CreationDate']).date
    if post_type == PostType.ANSWER:
        # answers carry no title or tags of their own, only a parent question
        title = None
        tags = None
        parent_id = attrib['ParentId']
    else:  # is question
        title = attrib['Title']
        tags = attrib['Tags']
        parent_id = None
    return Post(id=attrib['Id'],
                creation_date=date,
                post_type_id=post_type.value,
                title=title,
                text=sanitized,
                tags=tags,
                parent_id=parent_id)
Example #8
0
def test_sanitize_real_post():
    """Sanitizing a real SO answer (authored by Simon Larsén) should drop all
    markup and code segments, leaving only the surrounding prose.
    """
    answer_html = """<p>You can do this in just two lines.</p>

    <pre><code>with open('path/to/file') as f:
        line_lists = [list(line.strip()) for line in f]
    </code></pre>

    <p><code>list</code> on a <code>str</code> object will return a list where each character is an element (as a <code>char</code>). <code>line</code> is stripped first, which removes leading and trailing whitespace. This is assuming that you actually want the characters as <code>char</code>. If you want them parsed to <code>int</code>, this will work:</p>

    <pre><code>with open('path/to/file') as f:
        line_lists = [[int(x) for x in line.strip()] for line in f]
    </code></pre>

    <p>Mind you that there should be some error checking here, the above example will crash if any of the characters cannot be parsed to int.</p>
    """
    expected = "You can do this in just two lines. on a object will return a list where each character is an element (as a ). is stripped first, which removes leading and trailing whitespace. This is assuming that you actually want the characters as . If you want them parsed to , this will work: Mind you that there should be some error checking here, the above example will crash if any of the characters cannot be parsed to int."
    assert util.sanitize_post(answer_html) == expected
Example #9
0
def test_fill_database():
    """Fill the database from the test XML files, then verify that every post
    and comment row can be fetched by Id and carries the sanitized text and
    the ids found in the XML.
    """
    migrate_data.fill_database(questions_xml=TEST_POSTS_XML,
                               answers_xml=TEST_POSTS_XML,
                               comments_xml=TEST_COMMENTS_XML)

    session = database.Session()

    # check posts
    for _, node in ElementTree.iterparse(TEST_POSTS_XML):
        if node.tag != 'row':
            continue
        attrs = node.attrib
        stored_post = session.query(database.Post).get(int(attrs['Id']))
        assert stored_post.text == util.sanitize_post(attrs['Body'])
        assert stored_post.post_type_id == int(attrs['PostTypeId'])
        # only answers reference a parent question
        if stored_post.post_type_id == database.PostType.ANSWER.value:
            assert stored_post.parent_id == int(attrs['ParentId'])

    # check comments
    for _, node in ElementTree.iterparse(TEST_COMMENTS_XML):
        if node.tag != 'row':
            continue
        attrs = node.attrib
        stored_comment = session.query(database.Comment).get(
            int(attrs['Id']))
        assert stored_comment.text == util.sanitize_comment(attrs['Text'])
        assert stored_comment.post_id == int(attrs['PostId'])
Example #10
0
def test_sanitize_post_removes_blockquote_segments():
    """A blockquote segment placed in the base post must be removed, and the
    lone newline must come out as a single space.
    """
    with_blockquote = post_base_text.format(blockquote_segment, "\n", "")
    without_blockquote = post_base_text.format("", " ", "")
    assert util.sanitize_post(with_blockquote) == without_blockquote
Example #11
0
def test_sanitize_post_removes_code_pre_and_tags():
    """A stray tag, a code segment and a pre segment must all be removed,
    leaving only the base post text.
    """
    raw_post = post_base_text.format(
        "</a href=https://url.com>", code_segment, pre_segment)
    bare_post = post_base_text.format("", "", "")
    assert util.sanitize_post(raw_post) == bare_post
Example #12
0
def test_sanitize_post_removes_pre_segments():
    """A pre segment surrounded by newlines must be removed from the post."""
    raw_post = post_base_text.format("\n", pre_segment, "\n")
    # the two newlines are replaced with single space
    bare_post = post_base_text.format(" ", "", "")
    assert util.sanitize_post(raw_post) == bare_post