def sample_sim_index_collection():
    '''Demo of SimIndexCollection.

    Registers two MemorySimIndex shards with a SimIndexCollection, selects
    the 'tfidf' query scorer, indexes four university homepages through the
    collection, and pretty-prints the result of one query.
    '''
    print()
    print(
        "SimIndexCollection: build a collection, index some urls, and query it"
    )

    # Shards are created first, then handed to the collection in one call.
    shards = [MemorySimIndex(), MemorySimIndex()]
    collection = SimIndexCollection()
    collection.add_shards(*shards)
    collection.set_query_scorer('tfidf')

    homepages = (
        'http://www.stanford.edu/',
        'http://www.berkeley.edu',
        'http://www.ucla.edu',
        'http://www.mit.edu',
    )
    collection.index_urls(*homepages)

    hits = collection.query('stanford university')
    pprint(hits)
def setUp(self):
    '''Create a SimIndexCollection backed by two MemorySimIndex shards,
    then delegate to the parent class's setUp().'''
    print("SimIndexCollectionTest")

    collection = SimIndexCollection()
    self.sim_index = collection
    # Shards are added one per call, two in total.
    for _ in range(2):
        collection.add_shards(MemorySimIndex())

    super(SimIndexCollectionTest, self).setUp()
def test_save_load(self):
    '''Round-trip the index through save()/load() and re-run a query test'''
    with io.BytesIO() as stream:
        self.sim_index.save(stream)
        stream.seek(0)  # rewind so load() reads from the beginning
        reloaded = MemorySimIndex.load(stream)

    # Swap in the reloaded index; the simple-scorer query test must
    # still pass against it.
    self.sim_index = reloaded
    self.test_query_simple_scorer()
class MemorySimIndexTest(SimIndexTest, unittest.TestCase):
    '''
    Test case for MemorySimIndex.

    Tests that exercise the common SimIndex interface are inherited from
    SimIndexTest so they can be reused by every SimIndex implementation.
    Only tests for APIs outside that shared interface are defined here.
    '''

    def setUp(self):
        '''Install a fresh MemorySimIndex, then run the parent setUp().'''
        print("MemorySimIndexTest")
        self.sim_index = MemorySimIndex()
        super(MemorySimIndexTest, self).setUp()

    def tearDown(self):
        '''Nothing to clean up.'''

    def test_save_load(self):
        '''Round-trip the index through save()/load() and re-run a query test'''
        with io.BytesIO() as stream:
            self.sim_index.save(stream)
            stream.seek(0)  # rewind so load() reads from the beginning
            reloaded = MemorySimIndex.load(stream)

        # Swap in the reloaded index; the simple-scorer query test must
        # still pass against it.
        self.sim_index = reloaded
        self.test_query_simple_scorer()
def sample_sim_index():
    '''Demo of MemorySimIndex: index, inspect, query, save and reload.

    Indexes four university homepages, prints the postings list for
    'university' and the pages containing both 'university' and
    'california', runs the query 'stanford university' under the
    'simple_count' and 'tfidf' scorers, then saves the index to
    "myindex.idx" in the current directory and loads it back.
    '''
    # Create an in-memory index and query it
    print()
    print("Creating in-memory index of university homepages")
    sim_index = MemorySimIndex()
    sim_index.index_urls('http://www.stanford.edu/',
                         'http://www.berkeley.edu',
                         'http://www.ucla.edu',
                         'http://www.mit.edu')

    print("Postings list for 'university':")
    pprint(sim_index.postings_list('university'))
    print("Pages containing terms 'university' and 'california'")
    pprint(list(sim_index.docnames_with_terms('university', 'california')))

    # Issue some similarity queries
    print()
    print("Similarity search for query 'stanford university' (simple scorer)")
    sim_index.set_query_scorer('simple_count')
    pprint(list(sim_index.query("stanford university")))

    print()
    print("Similarity search for query 'stanford university' (tf.idf scorer)")
    sim_index.set_query_scorer('tfidf')
    pprint(list(sim_index.query("stanford university")))

    # Save the index to disk, then load it back in.
    # The file is opened in binary mode: the test suite drives save()/load()
    # with io.BytesIO, i.e. a byte stream, and a text-mode handle would
    # reject or mangle bytes.
    print()
    print("Saving index to disk")
    with open("myindex.idx", "wb") as index_file:
        sim_index.save(index_file)

    print()
    print("Loading index from disk")
    with open("myindex.idx", "rb") as index_file:
        sim_index2 = MemorySimIndex.load(index_file)

    print()
    print("Pages containing terms 'university' and 'california' in loaded index")
    pprint(list(sim_index2.docnames_with_terms('university', 'california')))
def setUp(self):
    '''Wrap a new MemorySimIndex in a ConcurrentSimIndex, then delegate to
    the parent class's setUp().'''
    print("ConcurrentSimIndexTest")

    backing_index = MemorySimIndex()
    self.sim_index = ConcurrentSimIndex(backing_index)

    super(ConcurrentSimIndexTest, self).setUp()
def setUp(self):
    '''Install a new MemorySimIndex as the index under test, then delegate
    to the parent class's setUp().'''
    print("MemorySimIndexTest")

    fresh_index = MemorySimIndex()
    self.sim_index = fresh_index

    super(MemorySimIndexTest, self).setUp()
def test_config(self):
    '''Check handling of config params (currently only 'lowercase')'''

    # Two documents that differ only in letter case.
    sample_docs = (('doc1', 'Hello There'),
                   ('doc2', 'hello there'))

    def _build_index(lowercase):
        '''Return a MemorySimIndex over sample_docs with the given
        'lowercase' setting applied before indexing.'''
        index = MemorySimIndex()
        index.set_config('lowercase', lowercase)
        index.index_string_buffers(sample_docs)
        return index

    def _check_lc(index, golden_results):
        '''Assert that, for every (term, docs) pair, both
        docnames_with_terms() and query() yield exactly docs.'''
        for term, expected_docs in golden_results:
            matched = set(index.docnames_with_terms(term))
            self.assertEqual(matched, expected_docs)
            ranked = {doc for doc, _score in index.query(term)}
            self.assertEqual(ranked, expected_docs)

    # lowercase=True: every casing of the term matches both documents.
    both_docs = {'doc1', 'doc2'}
    _check_lc(_build_index(True),
              (('hello', both_docs),
               ('Hello', both_docs),
               ('HELLO', both_docs)))

    # lowercase=False: a term only matches the document with that casing.
    _check_lc(_build_index(False),
              (('hello', {'doc2'}),
               ('Hello', {'doc1'}),
               ('HELLO', set())))
def sample_sim_index():
    '''Demo of MemorySimIndex: index, inspect, query, save and reload.

    Indexes four university homepages, prints the postings list for
    'university' and the pages containing both 'university' and
    'california', runs the query 'stanford university' under the
    'simple_count' and 'tfidf' scorers, then saves the index to
    "myindex.idx" in the current directory and loads it back.
    '''
    # Create an in-memory index and query it
    print()
    print("Creating in-memory index of university homepages")
    sim_index = MemorySimIndex()
    sim_index.index_urls('http://www.stanford.edu/',
                         'http://www.berkeley.edu',
                         'http://www.ucla.edu',
                         'http://www.mit.edu')

    print("Postings list for 'university':")
    pprint(sim_index.postings_list('university'))
    print("Pages containing terms 'university' and 'california'")
    pprint(list(sim_index.docnames_with_terms('university', 'california')))

    # Issue some similarity queries
    print()
    print("Similarity search for query 'stanford university' (simple scorer)")
    sim_index.set_query_scorer('simple_count')
    pprint(list(sim_index.query("stanford university")))

    print()
    print("Similarity search for query 'stanford university' (tf.idf scorer)")
    sim_index.set_query_scorer('tfidf')
    pprint(list(sim_index.query("stanford university")))

    # Save the index to disk, then load it back in.
    # The file is opened in binary mode: the test suite drives save()/load()
    # with io.BytesIO, i.e. a byte stream, and a text-mode handle would
    # reject or mangle bytes.
    print()
    print("Saving index to disk")
    with open("myindex.idx", "wb") as index_file:
        sim_index.save(index_file)

    print()
    print("Loading index from disk")
    with open("myindex.idx", "rb") as index_file:
        sim_index2 = MemorySimIndex.load(index_file)

    print()
    print(
        "Pages containing terms 'university' and 'california' in loaded index")
    pprint(list(sim_index2.docnames_with_terms('university', 'california')))