def test_deletion(self):
    """Deleting by a unique term removes the doc from counts after optimize."""
    schema = fields.Schema(key=fields.ID, name=fields.TEXT, value=fields.TEXT)
    storage = store.RamStorage()
    ix = index.Index(storage, schema, create=True)

    writer = writing.IndexWriter(ix)
    writer.add_document(key=u"A", name=u"Yellow brown",
                        value=u"Blue red green purple?")
    writer.add_document(key=u"B", name=u"Alpha beta",
                        value=u"Gamma delta epsilon omega.")
    writer.add_document(key=u"C", name=u"One two",
                        value=u"Three four five.")
    writer.commit()

    deleted = ix.delete_by_term("key", u"B")
    self.assertEqual(deleted, 1)
    ix.commit()

    # Until the segments are merged, the deleted document is still counted
    # by doc_count_all() but excluded from doc_count().
    self.assertEqual(ix.doc_count_all(), 3)
    self.assertEqual(ix.doc_count(), 2)

    ix.optimize()
    self.assertEqual(ix.doc_count(), 2)

    reader = ix.term_reader()
    self.assertEqual(list(reader.lexicon("name")),
                     ["brown", "one", "two", "yellow"])
def _create_index(self):
    """Build a fresh in-memory index with three plain KEYWORD fields."""
    schema = fields.Schema(f1=fields.KEYWORD(stored=True),
                           f2=fields.KEYWORD,
                           f3=fields.KEYWORD)
    return index.Index(store.RamStorage(), schema, create=True)
def test_creation(self):
    """Exercise both document-adding APIs: kwargs and start/end_document."""
    schema = fields.Schema()
    schema.add("content", fields.TEXT(phrase=True))
    schema.add("title", fields.TEXT(stored=True))
    schema.add("path", fields.ID(stored=True))
    schema.add("tags", fields.KEYWORD(stored=True))
    schema.add("quick", fields.NGRAM)
    schema.add("note", fields.STORED)

    ix = index.Index(store.RamStorage(), schema, create=True)
    writer = writing.IndexWriter(ix)

    # First document: the one-shot keyword-argument API.
    writer.add_document(title=u"First",
                        content=u"This is the first document",
                        path=u"/a",
                        tags=u"first second third",
                        quick=u"First document",
                        note=u"This is the first document")

    # Second document: the incremental field-by-field API.
    writer.start_document()
    writer.add_field("content", u"Let's try this again")
    writer.add_field("title", u"Second")
    writer.add_field("path", u"/b")
    writer.add_field("tags", u"Uno Dos Tres")
    writer.add_field("quick", u"Second document")
    writer.add_field("note", u"This is the second document")
    writer.end_document()

    writer.commit()
def test_frequency_text(self):
    """Check doc_frequency/frequency and reader iteration on word terms."""
    schema = fields.Schema(content=fields.KEYWORD)
    ix = index.Index(store.RamStorage(), schema, create=True)

    writer = ix.writer()
    writer.add_document(content=u"alfa bravo charlie delta echo")
    writer.add_document(content=u"bravo bravo bravo bravo charlie delta delta")
    writer.add_document(content=u"delta echo foxtrot")
    writer.commit()

    reader = ix.term_reader()
    # (term, expected doc frequency, expected total frequency)
    expectations = [(u"bravo", 2, 5),
                    (u"echo", 2, 2),
                    (u"alfa", 1, 1),
                    (u"delta", 3, 4),
                    (u"foxtrot", 1, 1),
                    (u"zulu", 0, 0)]
    for term, docfreq, freq in expectations:
        self.assertEqual(reader.doc_frequency("content", term), docfreq)
        self.assertEqual(reader.frequency("content", term), freq)

    # Iterating the reader yields (fieldnum, term, docfreq, freq) tuples
    # in term order; the absent "zulu" does not appear.
    self.assertEqual(list(reader),
                     [(0, u"alfa", 1, 1),
                      (0, u"bravo", 2, 5),
                      (0, u"charlie", 2, 2),
                      (0, u"delta", 3, 4),
                      (0, u"echo", 2, 2),
                      (0, u"foxtrot", 1, 1)])
def test_frequency_keyword(self):
    """Same frequency checks as test_frequency_text, with single-letter terms."""
    schema = fields.Schema(content=fields.KEYWORD)
    ix = index.Index(store.RamStorage(), schema, create=True)

    writer = ix.writer()
    writer.add_document(content=u"A B C D E")
    writer.add_document(content=u"B B B B C D D")
    writer.add_document(content=u"D E F")
    writer.commit()

    reader = ix.term_reader()
    # (term, expected doc frequency, expected total frequency)
    expectations = [(u"B", 2, 5),
                    (u"E", 2, 2),
                    (u"A", 1, 1),
                    (u"D", 3, 4),
                    (u"F", 1, 1),
                    (u"Z", 0, 0)]
    for term, docfreq, freq in expectations:
        self.assertEqual(reader.doc_frequency("content", term), docfreq)
        self.assertEqual(reader.frequency("content", term), freq)

    # Iteration yields (fieldnum, term, docfreq, freq) in sorted term order.
    self.assertEqual(list(reader),
                     [(0, u"A", 1, 1),
                      (0, u"B", 2, 5),
                      (0, u"C", 2, 2),
                      (0, u"D", 3, 4),
                      (0, u"E", 2, 2),
                      (0, u"F", 1, 1)])
def test_merged_lengths(self):
    """Field lengths must survive commits that skip segment merging."""
    schema = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                           f2=fields.KEYWORD(stored=True, scorable=True))
    ix = index.Index(store.RamStorage(), schema, create=True)

    # Segment 1 (normal commit).
    writer = writing.IndexWriter(ix)
    writer.add_document(f1=u"A B C", f2=u"X")
    writer.add_document(f1=u"B C D E", f2=u"Y Z")
    writer.commit()

    # Segments 2 and 3: commit with NO_MERGE so each stays separate.
    writer = writing.IndexWriter(ix)
    writer.add_document(f1=u"A", f2=u"B C D E X Y")
    writer.add_document(f1=u"B C", f2=u"X")
    writer.commit(writing.NO_MERGE)

    writer = writing.IndexWriter(ix)
    writer.add_document(f1=u"A B X Y Z", f2=u"B C")
    writer.add_document(f1=u"Y X", f2=u"A B")
    writer.commit(writing.NO_MERGE)

    reader = ix.doc_reader()
    self.assertEqual(reader[0]["f1"], u"A B C")
    # Docnums span segments; lengths must be read from the right segment.
    self.assertEqual(reader.doc_field_length(0, "f1"), 3)
    self.assertEqual(reader.doc_field_length(2, "f2"), 6)
    self.assertEqual(reader.doc_field_length(4, "f1"), 5)
def update_index(sender, instance, created, **kwargs):
    """Django signal handler keeping the Whoosh index in sync with a model.

    On creation a new document is added; otherwise the existing document is
    replaced via update_document (which matches on the schema's unique field).

    The original duplicated the document kwargs and the commit() call in both
    branches; here the only varying piece — which writer method to call — is
    selected once, so the field list cannot drift between the two paths.
    """
    storage = store.FileStorage(settings.WHOOSH_INDEX)
    ix = index.Index(storage, schema=WHOOSH_SCHEMA)
    writer = ix.writer()
    write = writer.add_document if created else writer.update_document
    write(title=unicode(instance),
          content=instance.content,
          url=unicode(instance.get_absolute_url()))
    writer.commit()
def test_integrity(self):
    """Documents added across two separate commits are all visible."""
    schema = fields.Schema(name=fields.TEXT, value=fields.TEXT)
    ix = index.Index(store.RamStorage(), schema, create=True)

    writer = writing.IndexWriter(ix)
    writer.add_document(name=u"Yellow brown", value=u"Blue red green purple?")
    writer.add_document(name=u"Alpha beta", value=u"Gamma delta epsilon omega.")
    writer.commit()

    # A second writer/commit cycle appends to the same index.
    writer = writing.IndexWriter(ix)
    writer.add_document(name=u"One two", value=u"Three four five.")
    writer.commit()

    reader = ix.term_reader()
    self.assertEqual(ix.doc_count_all(), 3)
    self.assertEqual(list(reader.lexicon("name")),
                     ["alpha", "beta", "brown", "one", "two", "yellow"])
def test_vector_postings(self):
    """A Positions term vector can be read back as (term, weight) pairs."""
    schema = fields.Schema(
        id=fields.ID(stored=True, unique=True),
        content=fields.TEXT(
            vector=fields.Positions(analyzer=analysis.StandardAnalyzer())))
    ix = index.Index(store.RamStorage(), schema, create=True)

    writer = ix.writer()
    writer.add_document(
        id=u'1',
        content=u'the quick brown fox jumped over the lazy dogs')
    writer.commit()

    reader = ix.doc_reader()
    terms = list(reader.vector_as(0, 0, "weight"))
    # "the" is a stop word, so it is absent; the rest sort alphabetically.
    expected = [(u'brown', 1.0), (u'dogs', 1.0), (u'fox', 1.0),
                (u'jumped', 1.0), (u'lazy', 1.0), (u'over', 1.0),
                (u'quick', 1.0)]
    self.assertEqual(terms, expected)
def test_missing_field_scoring(self):
    """Docs missing a field still index/search correctly after a merge."""
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           hobbies=fields.TEXT(stored=True))
    idx = index.Index(store.RamStorage(), schema, create=True)

    writer = idx.writer()
    writer.add_document(name=u'Frank', hobbies=u'baseball, basketball')
    writer.commit()
    # Field numbers are alphabetical: 0 = hobbies, 1 = name.
    self.assertEqual(idx.segments[0].field_length(0), 2)
    self.assertEqual(idx.segments[0].field_length(1), 1)

    # Second doc has no "hobbies" value at all.
    writer = idx.writer()
    writer.add_document(name=u'Jonny')
    writer.commit()

    # The commit merged into a single segment; hobbies length is unchanged.
    self.assertEqual(len(idx.segments), 1)
    self.assertEqual(idx.segments[0].field_length(0), 2)
    self.assertEqual(idx.segments[0].field_length(1), 2)

    parser = qparser.MultifieldParser(['name', 'hobbies'], schema=schema)
    searcher = idx.searcher()
    hits = searcher.search(parser.parse(u'baseball'))
    self.assertEqual(len(hits), 1)
def test_score_retrieval(self):
    """Search results expose per-hit scores that are real, non-trivial values."""
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT(stored=True))
    ix = index.Index(store.RamStorage(), schema, create=True)

    writer = ix.writer()
    writer.add_document(
        title=u"Miss Mary",
        content=u"Mary had a little white lamb its fleece was white as snow")
    writer.add_document(
        title=u"Snow White",
        content=u"Snow white lived in the forrest with seven dwarfs")
    writer.commit()

    searcher = ix.searcher()
    hits = searcher.search(Term("content", "white"))
    self.assertEqual(len(hits), 2)
    # "white" occurs twice in the first doc, so it ranks first.
    self.assertEqual(hits[0]['title'], u"Miss Mary")
    self.assertEqual(hits[1]['title'], u"Snow White")
    # The score must be present and not a degenerate 0/1 placeholder.
    self.assertNotEqual(hits.score(0), None)
    self.assertNotEqual(hits.score(0), 0)
    self.assertNotEqual(hits.score(0), 1)
def setUp(self):
    """Build a five-document in-memory index shared by this case's tests."""
    schema = fields.Schema(key=fields.ID(stored=True),
                           name=fields.TEXT,
                           value=fields.TEXT)
    ix = index.Index(store.RamStorage(), schema, create=True)

    writer = writing.IndexWriter(ix)
    docs = [(u"A", u"Yellow brown", u"Blue red green render purple?"),
            (u"B", u"Alpha beta", u"Gamma delta epsilon omega."),
            (u"C", u"One two", u"Three rendered four five."),
            (u"D", u"Quick went", u"Every red town."),
            (u"E", u"Yellow uptown", u"Interest rendering outer photo!")]
    for key, name, value in docs:
        writer.add_document(key=key, name=name, value=value)
    writer.commit()

    self.ix = ix
def test_lengths_ram(self):
    """Per-document and total field lengths are recorded correctly in RAM.

    The original built two throwaway lists (``ls1``/``ls2``) of
    doc_field_length calls that were never read and exactly duplicated the
    individual assertions below; those dead locals are removed here.
    """
    schema = fields.Schema(f1=fields.KEYWORD(stored=True, scorable=True),
                           f2=fields.KEYWORD(stored=True, scorable=True))
    ix = index.Index(store.RamStorage(), schema, create=True)

    writer = writing.IndexWriter(ix)
    writer.add_document(f1=u"A B C D E", f2=u"X Y Z")
    writer.add_document(f1=u"B B B B C D D Q", f2=u"Q R S T")
    writer.add_document(f1=u"D E F", f2=u"U V A B C D E")
    writer.commit()

    reader = ix.doc_reader()
    self.assertEqual(reader[0]["f1"], "A B C D E")

    # Per-document token counts for each scorable field.
    self.assertEqual(reader.doc_field_length(0, "f1"), 5)
    self.assertEqual(reader.doc_field_length(1, "f1"), 8)
    self.assertEqual(reader.doc_field_length(2, "f1"), 3)
    self.assertEqual(reader.doc_field_length(0, "f2"), 3)
    self.assertEqual(reader.doc_field_length(1, "f2"), 4)
    self.assertEqual(reader.doc_field_length(2, "f2"), 7)

    # Index-wide totals are the sums of the per-document lengths.
    self.assertEqual(ix.field_length("f1"), 16)
    self.assertEqual(ix.field_length("f2"), 14)
def make_index(self, dirname, schema):
    """Create a fresh on-disk index at *dirname*, making the directory if needed."""
    if not exists(dirname):
        mkdir(dirname)
    storage = store.FileStorage(dirname)
    return index.Index(storage, schema, create=True)