Python Corpus Beispiele

Programmiersprache: Python

Namespace / Paketname: text_data

Klasse / Typ: Corpus

Beispiele auf hotexamples.com: 11

Python Corpus – 11 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für die Klasse text_data.Corpus, die aus Open-Source-Projekten extrahiert wurden. Sie können die Beispiele bewerten, um deren Qualität zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

Corpus(10)

add_ngram_index(2)

update(2)

most_common(2)

ranked_search(2)

search_occurrence_count(1)

word_count(1)

term_count(1)

search_occurrences(1)

search_document_count(1)

search_documents(1)

search_document_freq(1)

_show_html_occurrences(1)

document_frequency(1)

document_count(1)

chunks(1)

word_frequency(1)

Beispiel #1

Datei anzeigen

def test_html_display():
    """Check the HTML returned by `text_data.Corpus._show_html_occurrences`.

    Three cases are exercised for the query "the dog": a result cap of one,
    no cap at all, and a narrowed `window_size`.
    """
    corpus = Corpus(["The cat ran to the dog", "The dog likes bones"])
    first_hit = ("<p><b>Showing Result 0 (Document ID 1)</b></p>"
                 "<p style='white-space=pre-wrap;'>"
                 "<b>The</b> <b>dog</b> likes bones"
                 "</p>")
    second_hit = ("<p><b>Showing Result 1 (Document ID 0)</b></p>"
                  "<p style='white-space=pre-wrap;'>"
                  "<b>The</b> cat ran to <b>the</b> <b>dog</b>"
                  "</p>")

    # With max_results=1 only a single result is expected.
    capped_html, capped_count = corpus._show_html_occurrences(
        "the dog", max_results=1)
    assert capped_html == first_hit
    assert capped_count == 1

    # Without a cap, both results are expected back to back.
    full_html, full_count = corpus._show_html_occurrences("the dog")
    assert full_html == capped_html + second_hit
    assert full_count == 2

    # A smaller window keeps only the text close to each match; the
    # trimmed context is marked with a bold ellipsis entity.
    windowed_html, _ignored = corpus._show_html_occurrences(
        "the dog", window_size=2)
    expected_windowed = ("<p><b>Showing Result 0 (Document ID 1)</b></p>"
                         "<p style='white-space=pre-wrap;'>"
                         "<b>The</b> <b>dog</b> l<b>&hellip;</b>"
                         "</p>"
                         "<p><b>Showing Result 1 (Document ID 0)</b></p>"
                         "<p style='white-space=pre-wrap;'>"
                         "<b>&hellip;</b><b>the</b> <b>dog</b>"
                         "</p>")
    assert windowed_html == expected_windowed

Beispiel #2

Datei anzeigen

def test_ngram_index(sep, prefix, suffix, default, bigram):
    """Check that `text_data.Corpus.add_ngram_index` registers a bigram index.

    All arguments are supplied by the caller (presumably via pytest
    parametrization that is not visible here). `default`, `sep`, `prefix`
    and `suffix` are forwarded positionally to `add_ngram_index`; `bigram`
    is the bigram expected to be the most common one, with a count of 2.
    """
    n = 2
    corpus = Corpus(["of the best or of the worst"])
    # No index of this size exists until it is requested explicitly.
    assert n not in corpus.ngram_indexes
    corpus.add_ngram_index(n, default, sep, prefix, suffix)
    assert n in corpus.ngram_indexes
    top_bigrams = corpus.ngram_indexes[n].most_common(1)
    assert top_bigrams == [(bigram, 2)]

Beispiel #3

Datei anzeigen

def test_document_ordering(query, output):
    """Check that `ranked_search` orders documents as TF-IDF would.

    `query` is the search string and `output` the expected list of document
    ids; both are supplied by the caller (presumably via pytest
    parametrization that is not visible here).
    """
    # (occurrences of "example", occurrences of "search") for each document.
    term_counts = [(80, 5), (0, 5), (1, 5)]
    texts = [
        " ".join(["example"] * n_example + ["search"] * n_search)
        for n_example, n_search in term_counts
    ]
    hits = Corpus(texts).ranked_search(query)
    # Each ranked entry is indexable; its first item carries the doc id.
    ranked_ids = []
    for hit in hits:
        ranked_ids.append(hit[0].doc_id)
    assert ranked_ids == output

Beispiel #4

Datei anzeigen

def test_document_search(query, output):
    """Check that `Corpus.search_documents` returns the expected result.

    `query` and `output` are supplied by the caller (presumably via pytest
    parametrization that is not visible here).
    """
    texts = [
        "The truth is out there",
        "Truth is, I don't know",
        "Is it truth?",
        "Truth",
        "He is no friend of the truth",
        "is it",
        "the cat is happy",
    ]
    found = Corpus(texts).search_documents(query)
    assert found == output

Beispiel #5

Datei anzeigen

def test_chunking():
    """Check the statistics of every corpus yielded by `text_data.Corpus.chunks`.

    Two identical sentences are chunked with `chunksize=1`, and each yielded
    corpus is expected to describe exactly one sentence.
    """
    sentence = "I ran to the park with the baseball."
    texts = [sentence for _ in range(2)]
    for chunk in Corpus.chunks(texts, chunksize=1):
        assert chunk.most_common(1) == [("the", 2)]
        assert chunk.vocab_size == 7
        assert chunk.num_words == 8
        assert len(chunk) == 1

Beispiel #6

Datei anzeigen

def test_search_metrics(query, document_count, occurrence_count,
                        document_freq):
    """Check the three search metrics of `Corpus` for one query.

    The metrics are `Corpus.search_document_count`,
    `Corpus.search_occurrence_count`, and `Corpus.search_document_freq`.
    `query` and the three expected values are supplied by the caller
    (presumably via pytest parametrization that is not visible here).
    """
    corpus = Corpus([
        "the food fight",
        "the fight for food",
        "the boxing fight",
        "the food parade",
        "food food food",
    ])
    found_documents = corpus.search_document_count(query)
    assert found_documents == document_count
    found_occurrences = corpus.search_occurrence_count(query)
    assert found_occurrences == occurrence_count
    found_frequency = corpus.search_document_freq(query)
    assert found_frequency == document_freq

Beispiel #7

Datei anzeigen

def test_basic_initialization():
    """Check the basic statistics of a `Corpus` built from one sentence."""
    sentence = "I ran to the park with the baseball."
    expected_vocab = {
        "i", "ran", "to", "the", "park", "with", "baseball"
    }
    corpus = Corpus([sentence])

    # Corpus-wide statistics.
    assert corpus.most_common(1) == [("the", 2)]
    assert corpus.vocab == expected_vocab
    assert corpus.vocab_size == 7
    assert corpus.num_words == 8
    assert len(corpus) == 1

    # Statistics for the single word "the".
    word = "the"
    assert corpus.word_count(word) == 2
    assert corpus.word_frequency(word) == 0.25
    assert corpus.document_count(word) == 1
    assert corpus.document_frequency(word) == 1.0
    assert corpus.term_count(word, 0) == 2

Beispiel #8

Datei anzeigen

def test_update():
    """Check that `Corpus.update` extends the corpus and its bigram index."""
    n = 2
    batch = ["example document" for _ in range(5)]
    corpus = Corpus(batch)
    assert len(corpus) == 5

    corpus.add_ngram_index(n=n)
    assert len(corpus.ngram_indexes[n]) == 5

    # Adding the same batch again should double both lengths.
    corpus.update(batch)
    assert len(corpus) == 10
    assert len(corpus.ngram_indexes[n]) == 10

Beispiel #9

Datei anzeigen

def test_empty_corpus():
    """Check that a `Corpus` can be created from, and updated with, no documents."""

    def assert_is_empty(candidate):
        # Every size-related view of the corpus must report "nothing".
        assert len(candidate) == 0
        assert candidate.vocab == set()
        assert candidate.most_common() == []
        assert candidate.vocab_size == 0
        assert candidate.num_words == 0

    corpus = Corpus([])
    assert_is_empty(corpus)
    corpus.update([])
    assert_is_empty(corpus)

Beispiel #10

Datei anzeigen

def test_document_positioning(query, output):
    """Check that occurrences returned by `ranked_search` come in the expected order.

    `query` and `output` are supplied by the caller (presumably via pytest
    parametrization that is not visible here).
    """
    text = "The cat and the hat and another cat"
    assert Corpus([text]).ranked_search(query) == output

Beispiel #11

Datei anzeigen

def test_phrase_search(query, output):
    """Check that `Corpus.search_occurrences` finds each instance of a query.

    `query` and `output` are supplied by the caller (presumably via pytest
    parametrization that is not visible here).
    """
    texts = [
        "The dog ran to the cat",
        "The dog ran to the other dog",
        "The cat sat",
    ]
    occurrences = Corpus(texts).search_occurrences(query)
    assert occurrences == output