예제 #1
0
def test_searching_alice_simple(index_dir):
    """Index alice.txt, merge its bigrams, and check the top ``SimpleScorer`` hit for 'Alice or Caterpillar'."""
    source_path = os.path.abspath('caterpillar/test_resources/alice.txt')
    with open(source_path, 'rbU') as source:
        source.seek(0)
        document_text = source.read()

        # Create the index with a single TEXT field analysed by the test analyser, then add the document.
        text_field = schema.TEXT(analyser=TestAnalyser())
        index_config = IndexConfig(SqliteStorage, schema=schema.Schema(text=text_field))
        with IndexWriter(index_dir, index_config) as index_writer:
            index_writer.add_document(text=document_text, frame_size=2)

        # Find the bigrams in the stored frames ...
        with IndexReader(index_dir) as index_reader:
            bigrams = find_bi_gram_words(index_reader.get_frames())

        # ... and merge each pair of words into its bigram term.
        with IndexWriter(index_dir) as index_writer:
            term_merges = []
            for bigram in bigrams:
                words = bigram.split(' ')
                term_merges.append(((words[0], words[1]), bigram))
            index_writer.merge_terms(merges=term_merges)

        # The first result returned by the simple scorer must have a score of exactly 2.
        with IndexReader(index_dir) as index_reader:
            simple_searcher = index_reader.searcher(scorer_cls=SimpleScorer)
            hits = simple_searcher.search(QSQ('Alice or Caterpillar'))
            assert hits[0].score == 2
예제 #2
0
def test_searching_mt_warning(index_dir):
    """Index the mt_warning sample, merge its bigrams, and check hit counts for numeric and '*' queries."""
    sample_path = os.path.abspath('caterpillar/test_resources/mt_warning_utf8.txt')
    with open(sample_path, 'rbU') as sample:
        raw_text = sample.read()

        # Create the index with a single TEXT field analysed by the test analyser, then add the document.
        text_field = schema.TEXT(analyser=TestAnalyser())
        index_config = IndexConfig(SqliteStorage, schema=schema.Schema(text=text_field))
        with IndexWriter(index_dir, index_config) as index_writer:
            index_writer.add_document(text=raw_text, frame_size=2)

        # Find the bigrams in the stored frames ...
        with IndexReader(index_dir) as index_reader:
            bigrams = find_bi_gram_words(index_reader.get_frames())

        # ... and merge each pair of words into its bigram term.
        with IndexWriter(index_dir) as index_writer:
            term_merges = []
            for bigram in bigrams:
                words = bigram.split(' ')
                term_merges.append(((words[0], words[1]), bigram))
            index_writer.merge_terms(merges=term_merges)

        with IndexReader(index_dir) as index_reader:
            searcher = index_reader.searcher()
            # Numeric terms, including ones containing ',' and '.', must be searchable.
            expected_counts = (('1770', 2), ('1,900', 1), ('4.4', 1))
            for query_text, expected in expected_counts:
                assert searcher.count(QSQ(query_text)) == expected
            # The bare wildcard query must count exactly as many hits as there are frames.
            assert searcher.count(QSQ('*')) == index_reader.get_frame_count()
예제 #3
0
def test_searching_alice(index_dir):
    """Test basic searching functions for Alice.

    Indexes ``alice.txt`` (calling ``fold_term_case`` and merging bigrams), then checks hit counts for wildcard,
    boolean and boosted query strings, inspects the data of returned hits, and verifies that passing a plain
    string instead of a query object raises ``TypeError``.
    """

    def _count_hits_without(term, hits):
        """Return how many of *hits* do not list *term* in their ``frame_terms``."""
        return sum(1 for hit in hits if term not in hit.frame_terms)

    # Only the file contents are needed; close the handle before doing any index work.
    with open(os.path.abspath('caterpillar/test_resources/alice.txt'), 'rbU') as f:
        data = f.read()

    analyser = TestAnalyser()
    config = IndexConfig(SqliteStorage, schema=schema.Schema(text=schema.TEXT(analyser=analyser)))
    with IndexWriter(index_dir, config) as writer:
        writer.add_document(text=data, frame_size=2)
        writer.fold_term_case()

    # Merge bigrams
    with IndexReader(index_dir) as reader:
        bigrams = find_bi_gram_words(reader.get_frames())
    with IndexWriter(index_dir) as writer:
        merges = []
        for bigram in bigrams:
            words = bigram.split(' ')
            merges.append(((words[0], words[1]), bigram))
        writer.merge_terms(merges=merges)

    with IndexReader(index_dir) as reader:
        searcher = reader.searcher()

        # Wildcards and boolean operators expressed in different ways must agree with each other.
        assert searcher.count(QSQ("King")) == searcher.count(QSQ("K?ng"))
        assert searcher.count(QSQ("Queen or K??g")) == 123 == \
            searcher.count(QSQ("King or Queen"))
        assert searcher.count(QSQ("King AND Queen")) == 4 == \
            searcher.count(MatchAllQuery([QSQ('King'), QSQ('Queen')])) == \
            searcher.count(QSQ('King')) - searcher.count(QSQ('King not Queen'))
        assert searcher.count(QSQ("King NOT Queen")) == 56
        assert searcher.count(QSQ('golden key')) == 6
        assert searcher.count(QSQ('*ing')) == 512
        assert searcher.count(QSQ("Alice and (thought or little)")) == \
            searcher.count(QSQ("Alice and thought or Alice and little")) == 95 == \
            searcher.count(MatchAllQuery([QSQ('Alice'), MatchSomeQuery([QSQ('thought'), QSQ('little')])]))
        assert searcher.count(QSQ("thistermdoesntexist")) == 0
        assert searcher.count(QSQ('Mock Turtle')) == 51
        assert searcher.count(QSQ('*t? R*b??')) == searcher.count(QSQ('White Rabbit'))

        assert "jury" in searcher.search(QSQ("jury"), limit=1)[0].data['text']

        # Changing the ``^`` weight on "voice" changes how many of the first ``voice_hits`` results lack that term.
        voice_hits = searcher.count(QSQ("voice"))
        assert voice_hits == 46
        results = searcher.search(QSQ("Alice or voice"), limit=voice_hits)
        assert _count_hits_without("voice", results) == 23
        results = searcher.search(QSQ("Alice or voice^0.2"), limit=voice_hits)
        assert _count_hits_without("voice", results) == 45
        results = searcher.search(QSQ("Alice or voice^0.5"), limit=voice_hits)
        assert _count_hits_without("voice", results) == 36
        results = searcher.search(QSQ("Alice or voice^20"), limit=voice_hits)
        for hit in results:
            assert "voice" in hit.frame_terms

        # NOTE(review): limit=0 presumably means "no limit" - confirm. These checks look at the last
        # ``voice_hits`` results of the returned list.
        results = searcher.search(QSQ("Alice or voice"), limit=0)
        assert _count_hits_without("voice", results[-voice_hits:]) == voice_hits
        results = searcher.search(QSQ("Alice^20 or voice"), limit=0)
        assert _count_hits_without("voice", results[-voice_hits:]) == 15

        results = searcher.search(QSQ("King not (court or evidence)"))
        assert len(results) == 25
        assert len(results.term_weights) == 1
        assert results.num_matches == 53 == searcher.count(MatchAllQuery([QSQ('King')], [QSQ('court or evidence')]))
        for hit in results:
            assert "evidence" not in hit.data['text']
            assert "court" not in hit.data['text']
            assert hit.data['_field'] == 'text'
            # Validate the keys of every hit (previously only the first hit's keys were checked on each pass).
            for key in hit.data:
                assert key in ('_id', '_doc_id', '_field') or not key.startswith('_')

        # Check multiple boostings; this example is totally contrived but a real case could occur when combining
        # different plugin queries.
        results = searcher.search(MatchSomeQuery([QSQ("King"), QSQ("court AND King^1.5")]))
        assert results.term_weights['King'] == 1.5

        with pytest.raises(TypeError):
            # Invalid query format
            searcher.count('hello')
예제 #4
0
# Copyright (c) 2012-2014 Kapiche Limited
# Author: Ryan Stuart <*****@*****.**>
"""Print out the bi_grams present in caterpillar/test_resources/alice.txt in a pretty format."""
import os
import json
import shutil
import tempfile

from caterpillar.processing.index import find_bi_gram_words, IndexWriter, IndexConfig, IndexReader
from caterpillar.processing.schema import Schema, TEXT
from caterpillar.storage.sqlite import SqliteStorage

# Build a throwaway index in a temporary directory and print the bigrams found in its frames as indented JSON.
path = tempfile.mkdtemp()
try:
    index_dir = os.path.join(path, "example")
    # Only the file contents are needed; close the handle before indexing.
    with open('caterpillar/test_resources/alice.txt', 'r') as f:
        data = f.read()
    with IndexWriter(index_dir, IndexConfig(SqliteStorage, Schema(text=TEXT))) as writer:
        writer.add_document(text=data, frame_size=2)
    # What are the bigrams?
    with IndexReader(index_dir) as reader:
        bi_grams = find_bi_gram_words(reader.get_frames())
        # The parenthesised single-argument form is valid as a Python 2 print statement and as a Python 3 call.
        print(json.dumps(bi_grams, indent=4))
finally:
    # Always remove the temporary directory, even if indexing or printing fails.
    shutil.rmtree(path)