class DataStore(object):
    """Save and retrieve data from the datastore.

    Uses xapian for indexing the data. Every operation is delegated to an
    ``IndexStore`` built over ``data_dir``.
    """

    def __init__(self, data_dir):
        """Create ``data_dir`` if it is missing and build the index over it."""
        if not os.path.exists(data_dir):
            os.mkdir(data_dir)
        self.index = IndexStore(data_dir)

    def add(self, url, text, categories=None):
        """Add an entry to the datastore.

        ``categories`` defaults to a fresh empty list on every call; a
        literal ``[]`` default would be one list object shared by all calls.
        """
        if categories is None:
            categories = []
        # The index is only written to inside its connect() context.
        with self.index.connect():
            self.index.add(url, text, categories)

    def search(self, text, category=None):
        """Search for matching documents for a term among the indexed
        documents.

        Returns a generator producing the URL of each search result, in the
        order the index returned the results.
        """
        res = self.index.search(text, category)
        return (self.index.get_url(r) for r in res)

    def exists(self, url):
        """Check whether a URL has already been indexed.

        Returns whatever the underlying index reports for ``url``.
        """
        # The result must be returned; without it callers always see None.
        return self.index.exists(url)

    def get_categories(self):
        """Get a list of the categories defined in the index."""
        return self.index.get_categories()

    def get_urls(self):
        """Get a list of the urls in the index."""
        return self.index.get_urls()

    def get_entries(self):
        """Get the entries in the index.

        Returns a generator of ``(url, category)`` tuples, where ``url`` is
        the first item of the document's ``'url'`` data field and
        ``category`` is the document's ``'category'`` data field.
        """
        docs = self.index.get_documents()
        return ((d.data['url'][0], d.data['category']) for d in docs)
class TestIndex(unittest.TestCase):
    """Tests for ``IndexStore`` run against a temporary index directory."""

    def setUp(self):
        """Build a connected index in a fresh temporary directory."""
        self.data_dir = tempfile.mkdtemp()
        self.index = IndexStore(self.data_dir)
        self.index._connect()

    def tearDown(self):
        """Disconnect the index and remove the temporary directory."""
        self.index._disconnect()
        shutil.rmtree(self.data_dir)

    def test_add(self):
        """Successive adds are expected to return '0', '1', '2'."""
        self.assertEqual(self.index.add("url", "Body of text"), '0')
        self.assertEqual(
            self.index.add("url2", "Body of text", date='2010-01-10'), '1')
        self.assertEqual(
            self.index.add("url3", "Lots of text", categories=['general']),
            '2')

    def test_search(self):
        """A single indexed document is found by a word from its text."""
        self.index.add("url", "Body of text")
        # Flush before searching, as every search test here does.
        self.index._conn.flush()
        res = self.index.search("Body")
        self.assertEqual(len(res), 1)
        self.assertEqual(res[0].rank, 0)
        self.assertEqual(res[0].id, '0')

    def test_search_two(self):
        """A word shared by two documents matches both of them."""
        self.index.add("url", "Body of text")
        self.index.add("url2", "Lot of text", date='2010-02-10')
        self.index._conn.flush()
        res = self.index.search("text")
        self.assertEqual(len(res), 2)
        self.assertEqual(res[0].id, '0')
        self.assertEqual(res[1].id, '1')

    def test_search_category(self):
        """Searching with ``category`` still finds the categorised entry."""
        self.index.add("url", "Body of text")
        self.index.add("url2", "Example text general")
        self.index.add("url3", "Lots of text", categories=['general'])
        self.index._conn.flush()
        res = self.index.search('lots')
        self.assertEqual(len(res), 1)
        res = self.index.search('lots', category='general')
        self.assertEqual(len(res), 1)
        res = self.index.search('nothing', category='general')
        self.assertEqual(len(res), 0)

    def test_add_multiple_categories(self):
        """An entry with two categories is found under each of them."""
        self.index.add("url", "Lots of text and text",
                       categories=['general', 'example'])
        self.index._conn.flush()
        res = self.index.search('lots', category='general')
        self.assertEqual(len(res), 1)
        res = self.index.search('lots', category='example')
        self.assertEqual(len(res), 1)

    def test_get_categories(self):
        """Categories are expected as 'example', 'general', 'latin'."""
        self.index.add("url", "Lots of text and text",
                       categories=['general', 'example'])
        self.index.add("url2", "Example text", categories=['example'])
        self.index.add("url3", "Lorem ipsum", categories=['latin'])
        self.index._conn.flush()
        self.assertEqual(
            list(self.index.get_categories()),
            ['example', 'general', 'latin']
        )

    def test_get_url_by_id(self):
        """The first result for 'example' is expected to be 'url2'."""
        self.index.add("url", "Lots of text and text",
                       categories=['general', 'example'])
        self.index.add("url2", "Example text", categories=['example'])
        self.index._conn.flush()
        res = self.index.search('example')
        self.assertEqual(self.index.get_url(res[0]), "url2")

    def test_exists(self):
        """``exists`` is truthy for an added URL and falsy for another."""
        self.index.add("url", "Lots of text and text",
                       categories=['general', 'example'])
        self.index._conn.flush()
        self.assertTrue(self.index.exists('url'))
        self.assertFalse(self.index.exists('url2'))

    def test_get_urls(self):
        """``get_urls`` is expected to yield 'url' then 'url2'."""
        self.index.add("url", "Lots of text and text",
                       categories=['general', 'example'])
        self.index.add("url2", "Example text", categories=['example'])
        self.index._conn.flush()
        res = self.index.get_urls()
        self.assertEqual(list(res), ['url', 'url2'])