def sample_sim_index():
    '''Walk through building, querying, saving and reloading a MemorySimIndex.

    Indexes four university homepages by URL, prints the postings list for
    'university' and the documents containing both 'university' and
    'california', runs the query "stanford university" under the
    'simple_count' and 'tfidf' scorers, then writes the index to
    "myindex.idx" in the current working directory and loads it back.

    All results are printed to stdout; nothing is returned.
    NOTE(review): index_urls() presumably fetches the pages over the
    network -- confirm before running offline.
    '''
    # Create an in-memory index and query it
    print()
    print("Creating in-memory index of university homepages")
    sim_index = MemorySimIndex()
    sim_index.index_urls('http://www.stanford.edu/',
                         'http://www.berkeley.edu',
                         'http://www.ucla.edu',
                         'http://www.mit.edu')

    print("Postings list for 'university':")
    pprint(sim_index.postings_list('university'))
    print("Pages containing terms 'university' and 'california'")
    pprint(list(sim_index.docnames_with_terms('university', 'california')))

    # Issue some similarity queries
    print()
    print("Similarity search for query 'stanford university' (simple scorer)")
    sim_index.set_query_scorer('simple_count')
    pprint(list(sim_index.query("stanford university")))

    print()
    print("Similarity search for query 'stanford university' (tf.idf scorer)")
    sim_index.set_query_scorer('tfidf')
    pprint(list(sim_index.query("stanford university")))

    # Save the index to disk, then load it back in.
    # The file is opened in binary mode: the unit tests round-trip
    # save()/load() through io.BytesIO, so the serialized index is bytes and
    # a text-mode handle would fail on Python 3.
    print()
    print("Saving index to disk")
    with open("myindex.idx", "wb") as index_file:
        sim_index.save(index_file)

    print()
    print("Loading index from disk")
    with open("myindex.idx", "rb") as index_file:
        sim_index2 = MemorySimIndex.load(index_file)

    print()
    print("Pages containing terms 'university' and 'california' in loaded index")
    pprint(list(sim_index2.docnames_with_terms('university', 'california')))
def sample_sim_index():
    '''Demonstrate the MemorySimIndex API end to end.

    Steps, each announced on stdout:
      1. Index four university homepages given by URL.
      2. Print the postings list for 'university' and the documents that
         contain both 'university' and 'california'.
      3. Run the query "stanford university" with the 'simple_count'
         scorer and again with the 'tfidf' scorer.
      4. Save the index to "myindex.idx" (current working directory),
         load it into a second index, and repeat the term lookup on it.

    Returns nothing.
    NOTE(review): index_urls() presumably downloads the pages -- confirm
    network requirements against its implementation.
    '''
    index_path = "myindex.idx"
    query = "stanford university"
    terms = ('university', 'california')

    # Create an in-memory index and query it
    print()
    print("Creating in-memory index of university homepages")
    sim_index = MemorySimIndex()
    sim_index.index_urls(
        'http://www.stanford.edu/',
        'http://www.berkeley.edu',
        'http://www.ucla.edu',
        'http://www.mit.edu',
    )

    print("Postings list for 'university':")
    pprint(sim_index.postings_list('university'))
    print("Pages containing terms 'university' and 'california'")
    pprint(list(sim_index.docnames_with_terms(*terms)))

    # Issue some similarity queries
    print()
    print("Similarity search for query 'stanford university' (simple scorer)")
    sim_index.set_query_scorer('simple_count')
    pprint(list(sim_index.query(query)))

    print()
    print("Similarity search for query 'stanford university' (tf.idf scorer)")
    sim_index.set_query_scorer('tfidf')
    pprint(list(sim_index.query(query)))

    # Save the index to disk, then load it back in.
    # Binary mode is required here: save()/load() are exercised against
    # io.BytesIO in the unit tests, i.e. they exchange bytes, which a
    # text-mode file object rejects on Python 3.
    print()
    print("Saving index to disk")
    with open(index_path, "wb") as index_file:
        sim_index.save(index_file)

    print()
    print("Loading index from disk")
    with open(index_path, "rb") as index_file:
        sim_index2 = MemorySimIndex.load(index_file)

    print()
    print("Pages containing terms 'university' and 'california' in loaded index")
    pprint(list(sim_index2.docnames_with_terms(*terms)))
class MemorySimIndexTest(SimIndexTest, unittest.TestCase):
    '''
    Runs the shared SimIndexTest suite against MemorySimIndex.

    Every test that only touches the common SimIndex interface lives in the
    parent class SimIndexTest so it can be reused by each SimIndex
    implementation.  Only APIs that the parent class does not cover are
    tested directly in this class.
    '''

    def setUp(self):
        '''Create the MemorySimIndex under test, then run the parent setUp.'''
        print("MemorySimIndexTest")
        self.sim_index = MemorySimIndex()
        super(MemorySimIndexTest, self).setUp()

    def tearDown(self):
        '''No per-test cleanup is performed.'''

    def test_save_load(self):
        '''Test save()/load() functionality'''
        with io.BytesIO() as buffer:
            self.sim_index.save(buffer)
            # Rewind so load() reads the serialized index from the beginning.
            buffer.seek(0)
            self.sim_index = MemorySimIndex.load(buffer)
            # make sure test_query() still works
            self.test_query_simple_scorer()