def test_searching_alice_simple(index_dir):
    """Check the score of the top hit for an Alice query under ``SimpleScorer``.

    The test indexes ``alice.txt`` with a frame size of 2, merges every bigram
    reported by ``find_bi_gram_words`` into a single term, and then searches
    for 'Alice or Caterpillar' with ``SimpleScorer`` as the scorer class.
    """
    source_path = os.path.abspath('caterpillar/test_resources/alice.txt')
    with open(source_path, 'rbU') as handle:
        handle.seek(0)
        text = handle.read()

    text_analyser = TestAnalyser()
    text_schema = schema.Schema(text=schema.TEXT(analyser=text_analyser))
    config = IndexConfig(SqliteStorage, schema=text_schema)
    with IndexWriter(index_dir, config) as writer:
        writer.add_document(text=text, frame_size=2)

    # Merge bigrams: each '<first> <second>' pair is folded into one term.
    with IndexReader(index_dir) as reader:
        bigrams = find_bi_gram_words(reader.get_frames())
    with IndexWriter(index_dir) as writer:
        merges = []
        for bigram in bigrams:
            words = bigram.split(' ')
            merges.append(((words[0], words[1]), bigram))
        writer.merge_terms(merges=merges)

    with IndexReader(index_dir) as reader:
        searcher = reader.searcher(scorer_cls=SimpleScorer)
        results = searcher.search(QSQ('Alice or Caterpillar'))
        top_hit = results[0]
        assert top_hit.score == 2
def test_searching_mt_warning(index_dir):
    """Check hit counts for numeric-looking queries on the mt warning data.

    The test indexes ``mt_warning_utf8.txt`` with a frame size of 2, merges the
    detected bigrams, and then verifies the counts for '1770', '1,900' and
    '4.4', plus that the '*' query matches as many frames as the reader
    reports.
    """
    source_path = os.path.abspath('caterpillar/test_resources/mt_warning_utf8.txt')
    with open(source_path, 'rbU') as handle:
        text = handle.read()

    text_analyser = TestAnalyser()
    text_schema = schema.Schema(text=schema.TEXT(analyser=text_analyser))
    config = IndexConfig(SqliteStorage, schema=text_schema)
    with IndexWriter(index_dir, config) as writer:
        writer.add_document(text=text, frame_size=2)

    # Merge bigrams: each '<first> <second>' pair is folded into one term.
    with IndexReader(index_dir) as reader:
        bigrams = find_bi_gram_words(reader.get_frames())
    with IndexWriter(index_dir) as writer:
        merges = []
        for bigram in bigrams:
            words = bigram.split(' ')
            merges.append(((words[0], words[1]), bigram))
        writer.merge_terms(merges=merges)

    with IndexReader(index_dir) as reader:
        searcher = reader.searcher()
        year_hits = searcher.count(QSQ('1770'))
        assert year_hits == 2
        thousands_hits = searcher.count(QSQ('1,900'))
        assert thousands_hits == 1
        decimal_hits = searcher.count(QSQ('4.4'))
        assert decimal_hits == 1
        # The bare wildcard is expected to match every frame in the index.
        assert searcher.count(QSQ('*')) == reader.get_frame_count()
def test_searching_alice(index_dir):
    """Exercise the basic search features against the Alice text.

    The test indexes ``alice.txt`` with a frame size of 2, folds term case,
    merges the detected bigrams, and then checks counts for wildcard and
    boolean queries, the effect of term boosting on ranking, exclusion
    queries, the fields exposed on a hit, and rejection of a non-query
    argument.
    """
    source_path = os.path.abspath('caterpillar/test_resources/alice.txt')
    with open(source_path, 'rbU') as handle:
        handle.seek(0)
        text = handle.read()

    text_analyser = TestAnalyser()
    text_schema = schema.Schema(text=schema.TEXT(analyser=text_analyser))
    config = IndexConfig(SqliteStorage, schema=text_schema)
    with IndexWriter(index_dir, config) as writer:
        writer.add_document(text=text, frame_size=2)
        writer.fold_term_case()

    # Merge bigrams: each '<first> <second>' pair is folded into one term.
    with IndexReader(index_dir) as reader:
        bigrams = find_bi_gram_words(reader.get_frames())
    with IndexWriter(index_dir) as writer:
        merges = []
        for bigram in bigrams:
            words = bigram.split(' ')
            merges.append(((words[0], words[1]), bigram))
        writer.merge_terms(merges=merges)

    def voice_misses(hits):
        """Return how many of ``hits`` have no 'voice' among their frame terms."""
        return sum(1 for hit in hits if "voice" not in hit.frame_terms)

    with IndexReader(index_dir) as reader:
        searcher = reader.searcher()

        def count(query_text):
            """Return the hit count for ``query_text`` parsed as a ``QSQ`` query."""
            return searcher.count(QSQ(query_text))

        # Wildcards and boolean operators.
        assert count("King") == count("K?ng")
        assert count("Queen or K??g") == 123 == count("King or Queen")
        assert count("King AND Queen") == 4 == \
            searcher.count(MatchAllQuery([QSQ('King'), QSQ('Queen')])) == \
            count('King') - count('King not Queen')
        assert count("King NOT Queen") == 56
        assert count('golden key') == 6
        assert count('*ing') == 512
        assert count("Alice and (thought or little)") == \
            count("Alice and thought or Alice and little") == 95 == \
            searcher.count(MatchAllQuery([QSQ('Alice'), MatchSomeQuery([QSQ('thought'), QSQ('little')])]))
        assert count("thistermdoesntexist") == 0
        assert count('Mock Turtle') == 51
        assert count('*t? R*b??') == count('White Rabbit')

        assert "jury" in searcher.search(QSQ("jury"), limit=1)[0].data['text']

        voice_hits = count("voice")
        assert voice_hits == 46

        # Among the top ``voice_hits`` results, the number of hits lacking
        # 'voice' depends on the boost applied to that term.
        boost_cases = (
            ("Alice or voice", 23),
            ("Alice or voice^0.2", 45),
            ("Alice or voice^0.5", 36),
        )
        for query_text, expected_misses in boost_cases:
            results = searcher.search(QSQ(query_text), limit=voice_hits)
            assert voice_misses(results) == expected_misses

        results = searcher.search(QSQ("Alice or voice^20"), limit=voice_hits)
        for hit in results:
            assert "voice" in hit.frame_terms

        # With limit=0, inspect the last ``voice_hits`` results instead.
        results = searcher.search(QSQ("Alice or voice"), limit=0)
        assert voice_misses(results[-voice_hits:]) == voice_hits

        results = searcher.search(QSQ("Alice^20 or voice"), limit=0)
        assert voice_misses(results[-voice_hits:]) == 15

        # Exclusion query, checked against the equivalent MatchAllQuery form.
        results = searcher.search(QSQ("King not (court or evidence)"))
        assert len(results) == 25
        assert len(results.term_weights) == 1
        assert results.num_matches == 53 == searcher.count(MatchAllQuery([QSQ('King')], [QSQ('court or evidence')]))
        for hit in results:
            hit_text = hit.data['text']
            assert "evidence" not in hit_text
            assert "court" not in hit_text
            assert hit.data['_field'] == 'text'
        # The only underscore-prefixed keys allowed on a hit are the listed ones.
        allowed_private = ('_id', '_doc_id', '_field')
        for key in results[0].data.iterkeys():
            assert key in allowed_private or not key.startswith('_')

        # Check multiple boostings; this example is totally contrived but a real case could occur when combining
        # different plugin queries.
        results = searcher.search(MatchSomeQuery([QSQ("King"), QSQ("court AND King^1.5")]))
        assert results.term_weights['King'] == 1.5

        with pytest.raises(TypeError):
            # Invalid query format
            searcher.count('hello')
# Copyright (c) 2012-2014 Kapiche Limited
# Author: Ryan Stuart <*****@*****.**>
"""Print out the bi_grams present in caterpillar/test_resources/alice.txt in a pretty format."""
import os
import json
import shutil
import tempfile

from caterpillar.processing.index import find_bi_gram_words, IndexWriter, IndexConfig, IndexReader
from caterpillar.processing.schema import Schema, TEXT
from caterpillar.storage.sqlite import SqliteStorage

# Build the index in a throwaway directory; the ``finally`` clause removes it
# even when indexing or printing fails.
path = tempfile.mkdtemp()
try:
    index_dir = os.path.join(path, "example")
    # The path is relative, so the script must be run from the directory that
    # contains the ``caterpillar`` package.
    with open('caterpillar/test_resources/alice.txt', 'r') as f:
        data = f.read()
    with IndexWriter(index_dir, IndexConfig(SqliteStorage, Schema(text=TEXT))) as writer:
        writer.add_document(text=data, frame_size=2)
    # What are the bigrams?
    with IndexReader(index_dir) as reader:
        bi_grams = find_bi_gram_words(reader.get_frames())
        # A parenthesised single argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(json.dumps(bi_grams, indent=4))
finally:
    shutil.rmtree(path)