def test_query_subtop_to_top(dublicate_posts, small_stop_words, get_loggers):
    """A top-2 query over 2023-2024 ranks 'sql' (7) above 'java' (6)."""
    result = StackoverflowAnalyzer(
        dublicate_posts, small_stop_words, *get_loggers
    ).process_query(2023, 2024, 2)
    expected = (('sql', 7), ('java', 6))
    for entry, (word, score) in zip(result['top'], expected):
        assert entry[0] == word
        assert entry[1] == score
def test_output(capsys, small_posts, small_stop_words, small_query_file, get_loggers):
    """process_queries prints one JSON line per query to stdout."""
    an = StackoverflowAnalyzer(small_posts, small_stop_words, *get_loggers)
    an.process_queries(small_query_file)
    out = capsys.readouterr().out

    # Results for both queries must appear, and nothing else (two lines total).
    expected_fragments = (
        '"top": [["better", 30], ["javascript", 20], ["python", 20], ["seo", 15]]',
        '"start": 2025, "end": 2025, "top": [["seo", 15], ["better", 10]]',
    )
    for fragment in expected_fragments:
        assert fragment in out
    assert out.count('\n') == 2
def test_logger(caplog, small_posts, small_stop_words, small_query_file, get_loggers):
    """Processing queries emits the expected lifecycle log messages.

    Fix: ``caplog.set_level`` must be called BEFORE the activity under test —
    the original called it after ``process_queries``, so DEBUG-level records
    emitted during processing could be filtered out and never captured.
    Duplicate assertions were also removed.
    """
    caplog.set_level("DEBUG")
    analyzer = StackoverflowAnalyzer(small_posts, small_stop_words, *get_loggers)
    analyzer.process_queries(small_query_file)

    assert 'process XML dataset' in caplog.text
    assert 'ready to serve queries' in caplog.text
    assert "got query" in caplog.text
    assert "got query \"2025,2025,2\"" in caplog.text
    assert 'finish processing queries' in caplog.text
def test_query_small_dataset(small_posts, small_stop_words, get_loggers):
    """Two queries on the small dataset return the expected ranked words."""
    analyzer = StackoverflowAnalyzer(small_posts, small_stop_words, *get_loggers)

    cases = (
        # (start, end, top_n, expected (word, score) pairs in rank order)
        (2025, 2025, 2, (('seo', 15), ('better', 10))),
        (2025, 2026, 4, (('better', 30), ('javascript', 20),
                         ('python', 20), ('seo', 15))),
    )
    for start, end, top_n, expected in cases:
        result = analyzer.process_query(start, end, top_n)
        for rank, (word, score) in enumerate(expected):
            assert result['top'][rank][0] == word
            assert result['top'][rank][1] == score
def test_stop_words_correctness_empty_file(tmpdir, get_loggers):
    """A stop-words file containing only blank lines yields an empty list."""
    blank_file = tmpdir.join("dataset.txt")
    content = dedent("""
        \n
        \n
            """)
    blank_file.write(content)

    analyzer = StackoverflowAnalyzer("", blank_file, *get_loggers)
    assert analyzer.stop_words == []
def test_posts_parse_correctness(small_posts, small_stop_words, get_loggers):
    """Parsing the small dataset buckets words under exactly two years."""
    analyzer = StackoverflowAnalyzer(small_posts, small_stop_words, *get_loggers)
    year_map = analyzer.year_to_words_map

    assert len(year_map) == 2
    # Present years and their distinct-word counts.
    for year, word_count in (('2025', 6), ('2026', 3)):
        assert year_map.get(year) is not None
        assert len(year_map[year]) == word_count
    # Years with no posts must be absent entirely.
    for missing_year in ('2010', '20', '2018'):
        assert year_map.get(missing_year) is None
def test_stop_words_correctness_with_empty_line(tmpdir, get_loggers):
    """Blank lines interleaved with words are skipped when loading stop words."""
    words_file = tmpdir.join("dataset.txt")
    content = dedent("""
        is
        \n
        than
        \n
        a
            """)
    words_file.write(content)

    analyzer = StackoverflowAnalyzer("", words_file, *get_loggers)
    assert set(analyzer.stop_words) == {'a', 'is', 'than'}
def test_posts_words_score_correctness(small_posts, small_stop_words, get_loggers):
    """Per-year word scores match the small dataset's expected totals."""
    analyzer = StackoverflowAnalyzer(small_posts, small_stop_words, *get_loggers)
    year_map = analyzer.year_to_words_map

    expected_scores = {
        '2025': {'seo': 15, 'better': 10, 'done': 10,
                 'with': 10, 'repetition': 10, 'what': 5},
        '2026': {'better': 20, 'python': 20, 'javascript': 20},
    }
    for year, scores in expected_scores.items():
        for word, score in scores.items():
            assert year_map[year][word] == score
def test_stop_words_encoding(tmpdir, get_loggers):
    """Stop words load in order from a koi8-r encoded file.

    Fix: the original passed ``encoding="koi8-r"`` to ``tmpdir.join()``,
    where py.path silently ignores unknown keyword arguments — the file was
    actually written in the default encoding and the test never exercised
    encoding handling.  The content must be *written* as koi8-r bytes.
    """
    stop_words_file = tmpdir.join("dataset.txt")
    stop_words_file.write_binary(
        dedent("""
            \n
            is
            \n
            the
            at
            a
            but
                """).encode("koi8-r")
    )
    analyzer = StackoverflowAnalyzer("", stop_words_file, *get_loggers)
    assert analyzer.stop_words == ["is", 'the', 'at', 'a', 'but']
def test_stop_words_correctness(small_stop_words, get_loggers):
    """The small stop-words fixture parses to exactly ['is', 'than']."""
    loaded = StackoverflowAnalyzer("", small_stop_words, *get_loggers).stop_words
    assert loaded == ['is', 'than']
def test_load_stop_words(small_stop_words, get_loggers):
    """load_stop_words can populate stop words after construction."""
    analyzer = StackoverflowAnalyzer("", "", *get_loggers)
    analyzer.load_stop_words(small_stop_words)
    word_total = len(analyzer.stop_words)
    assert word_total == 2
def test_no_exeption_in_parse_posts_file(get_loggers):
    """parse_posts_file must not raise for a nonexistent path.

    Fix: catch ``Exception`` instead of a bare ``except:`` so that
    ``KeyboardInterrupt``/``SystemExit`` are not swallowed, and report
    the actual exception in the failure message.
    """
    try:
        analyzer = StackoverflowAnalyzer("", "", *get_loggers)
        analyzer.parse_posts_file("somepath")
    except Exception as exc:
        pytest.fail(f"Unexpected Error: {exc!r}")
def test_no_exeption_in_constructor(get_loggers):
    """Constructing with empty paths must not raise.

    Fix: catch ``Exception`` instead of a bare ``except:`` so that
    ``KeyboardInterrupt``/``SystemExit`` are not swallowed, and report
    the actual exception in the failure message.
    """
    try:
        StackoverflowAnalyzer("", "", *get_loggers)
    except Exception as exc:
        pytest.fail(f"Unexpected Error: {exc!r}")
def test_query_correctness(dublicate_posts, small_stop_words, get_loggers):
    """A query over 2020-2022 ranks 'data' first with a score of 20."""
    result = StackoverflowAnalyzer(
        dublicate_posts, small_stop_words, *get_loggers
    ).process_query(2020, 2022, 4)
    top_word, top_score = result['top'][0][0], result['top'][0][1]
    assert top_word == 'data'
    assert top_score == 20