def test_html_display():
    """Checks the HTML and result counts returned by `Corpus._show_html_occurrences`."""
    corpus = Corpus(["The cat ran to the dog", "The dog likes bones"])
    first_hit = (
        "<p><b>Showing Result 0 (Document ID 1)</b></p>"
        "<p style='white-space=pre-wrap;'>"
        "<b>The</b> <b>dog</b> likes bones"
        "</p>"
    )
    second_hit = (
        "<p><b>Showing Result 1 (Document ID 0)</b></p>"
        "<p style='white-space=pre-wrap;'>"
        "<b>The</b> cat ran to <b>the</b> <b>dog</b>"
        "</p>"
    )

    # Capping max_results at one should render exactly one result.
    capped_html, capped_count = corpus._show_html_occurrences(
        "the dog", max_results=1
    )
    assert capped_html == first_hit
    assert capped_count == 1

    # Without a cap, both results are rendered back to back.
    full_html, full_count = corpus._show_html_occurrences("the dog")
    assert full_html == capped_html + second_hit
    assert full_count == 2

    # A narrow window trims the surrounding text; the expected markup shows
    # a bold ellipsis wherever the text was cut.
    narrow_html, _ = corpus._show_html_occurrences("the dog", window_size=2)
    expected_narrow = (
        "<p><b>Showing Result 0 (Document ID 1)</b></p>"
        "<p style='white-space=pre-wrap;'>"
        "<b>The</b> <b>dog</b> l<b>…</b>"
        "</p>"
        "<p><b>Showing Result 1 (Document ID 0)</b></p>"
        "<p style='white-space=pre-wrap;'>"
        "<b>…</b><b>the</b> <b>dog</b>"
        "</p>"
    )
    assert narrow_html == expected_narrow
def test_ngram_index(sep, prefix, suffix, default, bigram):
    """Verifies that `text_data.Corpus.add_ngram_index` creates a bigram index.

    `bigram` is the expected most common bigram for the given `sep`,
    `prefix`, `suffix` and `default` arguments.
    """
    size = 2
    corpus = Corpus(["of the best or of the worst"])

    # The index for this n-gram size must not exist until it is requested.
    assert size not in corpus.ngram_indexes

    corpus.add_ngram_index(size, default, sep, prefix, suffix)
    assert size in corpus.ngram_indexes

    # "of the" occurs twice in the document, so the top bigram has count 2.
    top_entry = corpus.ngram_indexes[size].most_common(1)
    assert top_entry == [(bigram, 2)]
def test_document_ordering(query, output):
    """Checks that `ranked_search` returns documents in the expected TF-IDF order.

    `output` is the expected list of document IDs for `query`.
    """
    token_lists = [
        ["example"] * 80 + ["search"] * 5,
        ["search"] * 5,
        ["example"] + ["search"] * 5,
    ]
    texts = list(map(" ".join, token_lists))
    corpus = Corpus(texts)

    # Each ranked entry is a sequence of hits; the first hit carries the
    # ID of the document that entry belongs to.
    doc_ids = []
    for hits in corpus.ranked_search(query):
        doc_ids.append(hits[0].doc_id)

    assert doc_ids == output
def test_document_search(query, output):
    """Checks that `Corpus.search_documents` returns `output` for `query`."""
    texts = [
        "The truth is out there",
        "Truth is, I don't know",
        "Is it truth?",
        "Truth",
        "He is no friend of the truth",
        "is it",
        "the cat is happy",
    ]
    matches = Corpus(texts).search_documents(query)
    assert matches == output
def test_chunking():
    """Checks the statistics of each corpus yielded by `text_data.Corpus.chunks`."""
    sentence = "I ran to the park with the baseball."

    # Two identical documents with chunksize=1: every yielded corpus is
    # expected to hold exactly one copy of the sentence.
    for chunk in Corpus.chunks([sentence, sentence], chunksize=1):
        top_word = chunk.most_common(1)
        assert top_word == [("the", 2)]
        assert chunk.vocab_size == 7
        assert chunk.num_words == 8
        assert len(chunk) == 1
def test_search_metrics(query, document_count, occurrence_count, document_freq):
    """Checks the three query-level metrics against their expected values.

    The metrics under test are `Corpus.search_document_count`,
    `Corpus.search_occurrence_count`, and `Corpus.search_document_freq`.
    """
    corpus = Corpus(
        [
            "the food fight",
            "the fight for food",
            "the boxing fight",
            "the food parade",
            "food food food",
        ]
    )

    observed_doc_count = corpus.search_document_count(query)
    assert observed_doc_count == document_count

    observed_occurrences = corpus.search_occurrence_count(query)
    assert observed_occurrences == occurrence_count

    observed_doc_freq = corpus.search_document_freq(query)
    assert observed_doc_freq == document_freq
def test_basic_initialization():
    """Checks the basic statistics of a `Corpus` built from one document."""
    corpus = Corpus(["I ran to the park with the baseball."])
    expected_vocab = {"i", "ran", "to", "the", "park", "with", "baseball"}

    # Corpus-wide statistics.
    assert corpus.most_common(1) == [("the", 2)]
    assert corpus.vocab == expected_vocab
    assert corpus.vocab_size == 7
    assert corpus.num_words == 8
    assert len(corpus) == 1

    # Statistics for one word: "the" makes up 2 of the 8 words.
    word = "the"
    assert corpus.word_count(word) == 2
    assert corpus.word_frequency(word) == 0.25
    assert corpus.document_count(word) == 1
    assert corpus.document_frequency(word) == 1.0
    assert corpus.term_count(word, 0) == 2
def test_update():
    """Checks that `Corpus.update` grows the corpus and its bigram index."""
    batch = ["example document"] * 5
    corpus = Corpus(batch)
    assert len(corpus) == 5

    # Build the bigram index first so the update has to extend it as well.
    corpus.add_ngram_index(n=2)
    bigram_index = corpus.ngram_indexes[2]
    assert len(bigram_index) == 5

    corpus.update(batch)
    assert len(corpus) == 10
    # Look the index up again rather than reusing the earlier reference.
    assert len(corpus.ngram_indexes[2]) == 10
def test_empty_corpus():
    """Checks that a `Corpus` can be created and updated with no documents."""

    def check_is_empty(candidate):
        # Every size and vocabulary statistic should report nothing.
        assert len(candidate) == 0
        assert candidate.vocab == set()
        assert candidate.most_common() == []
        assert candidate.vocab_size == 0
        assert candidate.num_words == 0

    corpus = Corpus([])
    check_is_empty(corpus)

    # Updating with an empty batch must leave the corpus empty.
    corpus.update([])
    check_is_empty(corpus)
def test_document_positioning(query, output):
    """Checks that `ranked_search` lists occurrences in the expected order."""
    text = "The cat and the hat and another cat"
    results = Corpus([text]).ranked_search(query)
    assert results == output
def test_phrase_search(query, output):
    """Checks that `Corpus.search_occurrences` returns `output` for `query`."""
    texts = [
        "The dog ran to the cat",
        "The dog ran to the other dog",
        "The cat sat",
    ]
    occurrences = Corpus(texts).search_occurrences(query)
    assert occurrences == output