def test_union():
    """Nested UnionMatchers should yield every id from all three lists once, in order."""
    id_lists = (
        [1, 2, 3, 4, 5, 6, 7, 8],
        [2, 4, 8, 10, 20, 30],
        [10, 100, 200],
    )
    first, second, third = [matching.ListMatcher(ids) for ids in id_lists]

    # Right-nested tree: first | (second | third)
    nested = matching.UnionMatcher(first, matching.UnionMatcher(second, third))

    expected = [1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 30, 100, 200]
    assert expected == list(nested.all_ids())
def test_listmatcher():
    """Exercise ListMatcher stepping, all_ids(), the ``position`` argument and copy()."""

    def remaining_ids(m):
        # Step the matcher until it goes inactive, recording each id it visits.
        seen = []
        while m.is_active():
            seen.append(m.id())
            m.next()
        return seen

    docnums = [1, 2, 5, 9, 10]

    # Stepping through the matcher reports every id with a score of 1.0.
    walker = matching.ListMatcher(docnums)
    scored = []
    while walker.is_active():
        scored.append((walker.id(), walker.score()))
        walker.next()
    assert scored == [(1, 1.0), (2, 1.0), (5, 1.0), (9, 1.0), (10, 1.0)]

    # all_ids() reproduces the list the matcher was built from.
    assert list(matching.ListMatcher(docnums).all_ids()) == docnums

    # Starting at position 3 leaves only the last two ids.
    assert remaining_ids(matching.ListMatcher(docnums, position=3)) == [9, 10]

    # A copy taken after three next() calls resumes from the same place.
    advanced = matching.ListMatcher(docnums)
    for _ in xrange(3):
        advanced.next()
    assert remaining_ids(advanced.copy()) == [9, 10]
def test_replacements():
    """Check what replace(minquality) hands back for several matcher types."""

    def weighted(weight):
        # ListMatcher over ids 1..3 whose weights and scorer all use `weight`.
        return matching.ListMatcher([1, 2, 3], [weight] * 3,
                                    scorer=scoring.WeightScorer(weight))

    # Two identical leaves sharing a single scorer, and their union.
    shared_scorer = scoring.WeightScorer(0.25)
    left = matching.ListMatcher([1, 2, 3], [0.25, 0.25, 0.25],
                                scorer=shared_scorer)
    right = matching.ListMatcher([1, 2, 3], [0.25, 0.25, 0.25],
                                 scorer=shared_scorer)
    union = matching.UnionMatcher(left, right)

    # A single leaf is replaced by the null matcher at quality 0.5.
    assert_equal(left.replace(0.5).__class__, matching.NullMatcherClass)

    # The union becomes an intersection at 0.5 and the null matcher at 0.6.
    assert_equal(union.replace(0.5).__class__, matching.IntersectionMatcher)
    assert_equal(union.replace(0.6).__class__, matching.NullMatcherClass)

    # A wrapper stays a wrapper, keeps its boost, and has its child replaced.
    wrapped = matching.WrappingMatcher(union, boost=2.0).replace(0.5)
    assert_equal(wrapped.__class__, matching.WrappingMatcher)
    assert_equal(wrapped.boost, 2.0)
    assert_equal(wrapped.child.__class__, matching.IntersectionMatcher)

    ls1 = weighted(0.1)
    ls2 = weighted(0.2)
    ls3 = weighted(0.3)

    # MultiMatcher ends up with `current` == 2 after replace(0.25).
    multi = matching.MultiMatcher([ls1, ls2, ls3], [0, 4, 8]).replace(0.25)
    assert_equal(multi.current, 2)

    # DisjunctionMaxMatcher.replace(0.15) returns the second leaf itself.
    dismax = matching.DisjunctionMaxMatcher(ls1, ls2).replace(0.15)
    assert dismax is ls2
def test_arrayunion():
    """ArrayUnionMatcher.skip_to() should land on the expected ids across parts."""
    tens = matching.ListMatcher(list(range(10, 101, 10)))
    hundreds = matching.ListMatcher(list(range(100, 601, 100)))
    aum = matching.ArrayUnionMatcher([tens, hundreds], 600, partsize=5)

    assert aum.id() == 10

    # (skip target, id the matcher is expected to report afterwards)
    for target, landed_on in ((45, 50), (550, 600)):
        aum.skip_to(target)
        assert aum.id() == landed_on
def test_arrayunion2():
    """ArrayUnionMatcher with a short first sub-matcher and a far skip_to()."""
    id_lists = ([1, 2], [1, 2, 10, 20], [1, 5, 10, 50])
    shortest, middle, sparse = [matching.ListMatcher(ids) for ids in id_lists]
    aum = matching.ArrayUnionMatcher([shortest, middle, sparse], 51,
                                     partsize=2)

    assert aum.id() == 1
    # The two-element sub-matcher is already inactive after construction.
    assert not shortest.is_active()

    aum.skip_to(50)
    assert aum.id() == 50
def test_union_scores():
    """A nested union reports, for each id, a score equal to how many lists hold it."""
    first = matching.ListMatcher([1, 2, 3])
    second = matching.ListMatcher([2, 4, 8])
    third = matching.ListMatcher([2, 3, 8])
    union = matching.UnionMatcher(first, matching.UnionMatcher(second, third))

    scored = []
    while union.is_active():
        pair = (union.id(), union.score())
        scored.append(pair)
        union.next()

    expected = [(1, 1.0), (2, 3.0), (3, 2.0), (4, 1.0), (8, 2.0)]
    assert expected == scored
def test_wrapper():
    """WrappingMatcher with boost=2.0 doubles scores and passes ids through."""
    # Stepping: every id is reported with the boosted score.
    boosted = matching.WrappingMatcher(
        matching.ListMatcher([1, 2, 5, 9, 10]), boost=2.0)
    scored = []
    while boosted.is_active():
        scored.append((boosted.id(), boosted.score()))
        boosted.next()
    assert scored == [(1, 2.0), (2, 2.0), (5, 2.0), (9, 2.0), (10, 2.0)]

    # all_ids(): the wrapped matcher's ids come through unchanged.
    docnums = [1, 2, 5, 9, 10]
    boosted = matching.WrappingMatcher(matching.ListMatcher(docnums),
                                       boost=2.0)
    assert list(boosted.all_ids()) == docnums
def test_andnot():
    """AndNotMatcher: stepping with scores, all_ids(), and copy() mid-iteration."""

    def build():
        # Fresh AndNotMatcher over the fixture used by three of the checks.
        positive = matching.ListMatcher([1, 4, 10, 20, 90])
        negative = matching.ListMatcher([0, 4, 20])
        return matching.AndNotMatcher(positive, negative)

    # Stepping yields the positive ids that are absent from the negative list.
    anm = build()
    scored = []
    while anm.is_active():
        scored.append((anm.id(), anm.score()))
        anm.next()
    assert scored == [(1, 1.0), (10, 1.0), (90, 1.0)]

    # all_ids() on a second, smaller fixture.
    echo_lm = matching.ListMatcher([0, 1, 2, 3, 4])
    bravo_lm = matching.ListMatcher([0, 1])
    assert list(matching.AndNotMatcher(echo_lm, bravo_lm).all_ids()) == [2, 3, 4]

    # all_ids() on the main fixture.
    assert list(build().all_ids()) == [1, 10, 90]

    # A copy taken after two next() calls has only the last id left.
    anm = build()
    anm.next()
    anm.next()
    snapshot = anm.copy()
    leftover = []
    while snapshot.is_active():
        leftover.append(snapshot.id())
        snapshot.next()
    assert leftover == [90]
def matcher(self, searcher, weighting=None):
    """Return a ListMatcher over the documents selected by ``self.fieldname``.

    Three strategies are used, depending on the field name and the reader:

    * field name is ``None``, ``""`` or ``"*"``: all document ids reported by
      the reader;
    * the reader supports caches and has a field cache for the field: the
      documents whose cache ordinal is non-zero;
    * otherwise: the union of the posting lists of every term in the field.

    Every document gets ``self.boost`` as its weight. ``weighting`` is
    accepted but not used here.
    """
    field = self.fieldname
    reader = searcher.reader()

    if field in (None, "", "*"):
        # The reader's own id list already takes deletions into account.
        docs = array("I", reader.all_doc_ids())
    elif not (reader.supports_caches()
              and reader.fieldcache_available(field)):
        # Slow fallback: gather the document numbers of every term in the
        # field into an in-memory set. This gets expensive on large indexes.
        collected = set()
        for text in searcher.lexicon(field):
            postings = searcher.postings(field, text)
            collected.update(postings.all_ids())
        docs = sorted(collected)
    else:
        # Fast path: the field cache lists an ordinal per document; keep the
        # documents whose ordinal is not 0 (those with a value for the field).
        cache = reader.fieldcache(self.fieldname)
        docs = array("I", (docnum for docnum, ordinal in cache.ords()
                           if ordinal != 0))

    return matching.ListMatcher(docs, all_weights=self.boost)
def matcher(self, searcher, weighting=None):
    """Return a matcher giving every document of the child query one fixed score.

    The child's matcher is returned untouched when it is the null matcher.
    Otherwise its ids are read into an unsigned-int array and served by a
    ListMatcher whose weights are all ``self.score``. ``weighting`` is
    accepted but not used here.
    """
    child = self.child.matcher(searcher)
    if isinstance(child, matching.NullMatcherClass):
        return child

    # Read the ids first, then ask the child for its term, as before.
    docnums = array("I", child.all_ids())
    return matching.ListMatcher(docnums, all_weights=self.score,
                                term=child.term())
def test_inverse():
    """InverseMatcher with limit 15 yields the ids below 15 missing from the source."""
    source = matching.ListMatcher([1, 5, 10, 11, 13])
    inverted = matching.InverseMatcher(source, 15)

    visited = []
    while inverted.is_active():
        visited.append(inverted.id())
        inverted.next()

    assert visited == [0, 2, 3, 4, 6, 7, 8, 9, 12, 14]
def test_filter():
    """FilterMatcher keeps only the source ids that appear in the given set."""

    def source():
        # Fresh matcher over ids 2..9 for each case.
        return matching.ListMatcher(list(range(2, 10)))

    # (allowed ids, ids expected to survive the filter)
    cases = (
        (frozenset([3, 9]), [3, 9]),
        (frozenset([1, 5, 9, 13]), [5, 9]),
    )
    for allowed, expected in cases:
        fm = matching.FilterMatcher(source(), allowed)
        assert list(fm.all_ids()) == expected
def test_random_andnot():
    """Randomised check that AndNotMatcher equals a plain set difference."""
    testcount = 100
    rangesize = 100
    universe = list(range(rangesize))

    for _ in xrange(testcount):
        # Pick a random-sized, sorted subset of the universe to exclude.
        howmany = randint(0, rangesize - 1)
        excluded = sorted(sample(universe, howmany))
        excluded_set = frozenset(excluded)
        expected = [n for n in universe if n not in excluded_set]

        anm = matching.AndNotMatcher(matching.ListMatcher(universe),
                                     matching.ListMatcher(excluded))
        assert list(anm.all_ids()) == expected
def test_listmatcher_skip_to_quality_identical_scores():
    """skip_to_quality(0.3) must not drop postings when every score is 1.0."""
    docnums = [1, 2, 5, 9, 10]
    lm = matching.ListMatcher(docnums, scorer=WeightScorer(1.0))
    lm.skip_to_quality(0.3)

    scored = []
    while lm.is_active():
        pair = (lm.id(), lm.score())
        scored.append(pair)
        lm.next()

    assert scored == [(1, 1.0), (2, 1.0), (5, 1.0), (9, 1.0), (10, 1.0)]
def test_inverse_skip():
    """After skip_to(8), InverseMatcher yields only the missing ids from 8 upward."""
    source = matching.ListMatcher([1, 5, 10, 11, 13])
    inverted = matching.InverseMatcher(source, 15)
    inverted.skip_to(8)

    visited = []
    while inverted.is_active():
        visited.append(inverted.id())
        inverted.next()

    assert visited == [8, 9, 12, 14]
def test_exclude():
    """FilterMatcher(exclude=True) drops the listed ids; also checks copy()."""

    def build():
        # Fresh excluding filter over the same fixture for each check.
        return matching.FilterMatcher(matching.ListMatcher([1, 2, 5, 9, 10]),
                                      frozenset([2, 9]), exclude=True)

    # all_ids() is checked on two independently built matchers.
    assert list(build().all_ids()) == [1, 5, 10]
    assert list(build().all_ids()) == [1, 5, 10]

    # A copy taken after two next() calls has only the last id left.
    em = build()
    em.next()
    em.next()
    snapshot = em.copy()
    leftover = []
    while snapshot.is_active():
        leftover.append(snapshot.id())
        snapshot.next()
    assert leftover == [10]
def matcher(self, searcher, context=None):
    """Return a matcher giving every document of the child query one fixed score.

    A default ``SearchContext`` is created when ``context`` is falsy. The
    child's matcher is returned as-is when the context has ``needs_current``
    set or when the child is the null matcher. Otherwise its ids are read
    into an unsigned-int array and served by a ListMatcher whose weights are
    all ``self.score``.
    """
    from whoosh.searching import SearchContext

    if not context:
        context = SearchContext()

    child = self.child.matcher(searcher, context)
    passthrough = (context.needs_current
                   or isinstance(child, matching.NullMatcherClass))
    if passthrough:
        return child

    # Read the ids first, then ask the child for its term, as before.
    docnums = array("I", child.all_ids())
    return matching.ListMatcher(docnums, all_weights=self.score,
                                term=child.term())
def test_empty_andnot():
    """AndNotMatcher behaviour when one or both sides are the null matcher."""
    # Both sides empty: inactive, and no ids at all.
    both_empty = matching.AndNotMatcher(matching.NullMatcher(),
                                        matching.NullMatcher())
    assert not both_empty.is_active()
    assert not list(both_empty.all_ids())

    # Only the negative side empty: every positive id comes through.
    positive = matching.ListMatcher(list(range(1, 11)))
    nothing_removed = matching.AndNotMatcher(positive, matching.NullMatcher())
    assert list(nothing_removed.all_ids()) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
def test_simple_intersection():
    """IntersectionMatcher: stepping with scores, all_ids(), and copy() at the end."""

    def build():
        # Fresh intersection over the shared fixture for each check.
        left = matching.ListMatcher([1, 4, 10, 20, 90])
        right = matching.ListMatcher([0, 4, 20])
        return matching.IntersectionMatcher(left, right)

    # Stepping: the common ids, each with the two sub-scores added together.
    im = build()
    scored = []
    while im.is_active():
        scored.append((im.id(), im.score()))
        im.next()
    assert scored == [(4, 2.0), (20, 2.0)]

    # all_ids(): just the common ids.
    assert list(build().all_ids()) == [4, 20]

    # Two next() calls use up both matches, so a copy has nothing left.
    im = build()
    im.next()
    im.next()
    snapshot = im.copy()
    leftover = []
    while snapshot.is_active():
        leftover.append(snapshot.id())
        snapshot.next()
    assert not leftover
def test_require():
    """RequireMatcher: stepping with scores, all_ids(), and copy() at the end."""

    def build():
        # Fresh RequireMatcher over the shared fixture for each check.
        scored_side = matching.ListMatcher([1, 4, 10, 20, 90])
        required_side = matching.ListMatcher([0, 4, 20])
        return matching.RequireMatcher(scored_side, required_side)

    # Stepping: the common ids, each with a score of 1.0.
    rm = build()
    scored = []
    while rm.is_active():
        scored.append((rm.id(), rm.score()))
        rm.next()
    assert scored == [(4, 1.0), (20, 1.0)]

    # all_ids(): just the common ids.
    assert list(build().all_ids()) == [4, 20]

    # Two next() calls use up both matches, so a copy has nothing left.
    rm = build()
    rm.next()
    rm.next()
    snapshot = rm.copy()
    leftover = []
    while snapshot.is_active():
        leftover.append(snapshot.id())
        snapshot.next()
    assert not leftover
def test_andmaybe():
    """AndMaybeMatcher: stepping with scores, all_ids(), and copy() mid-iteration."""

    def build():
        # Fresh AndMaybeMatcher over the shared fixture for each check.
        required = matching.ListMatcher([1, 4, 10, 20, 90])
        optional = matching.ListMatcher([0, 4, 20])
        return matching.AndMaybeMatcher(required, optional)

    # Stepping: every id of the first list; ids also in the second score 2.0.
    amm = build()
    scored = []
    while amm.is_active():
        scored.append((amm.id(), amm.score()))
        amm.next()
    assert scored == [(1, 1.0), (4, 2.0), (10, 1.0), (20, 2.0), (90, 1.0)]

    # all_ids(): exactly the ids of the first list.
    assert list(build().all_ids()) == [1, 4, 10, 20, 90]

    # A copy taken after two next() calls resumes at the third id.
    amm = build()
    amm.next()
    amm.next()
    snapshot = amm.copy()
    leftover = []
    while snapshot.is_active():
        leftover.append(snapshot.id())
        snapshot.next()
    assert leftover == [10, 20, 90]
def test_simple_union():
    """UnionMatcher: stepping with scores, all_ids(), and copy() mid-iteration."""

    def build():
        # Fresh union over the shared fixture for each check.
        left = matching.ListMatcher([1, 4, 10, 20, 90])
        right = matching.ListMatcher([0, 4, 20])
        return matching.UnionMatcher(left, right)

    # Stepping: ids from either list; ids present in both score 2.0.
    um = build()
    scored = []
    while um.is_active():
        scored.append((um.id(), um.score()))
        um.next()
    assert scored == [(0, 1.0), (1, 1.0), (4, 2.0), (10, 1.0), (20, 2.0),
                      (90, 1.0)]

    # all_ids(): the merged ids in order, without duplicates.
    assert list(build().all_ids()) == [0, 1, 4, 10, 20, 90]

    # A copy taken after two next() calls resumes at the third id.
    um = build()
    um.next()
    um.next()
    snapshot = um.copy()
    leftover = []
    while snapshot.is_active():
        leftover.append(snapshot.id())
        snapshot.next()
    assert leftover == [4, 10, 20, 90]
def create_matchers():
    """Build a MultiMatcher fixture over two unions and one inverted union.

    Six ListMatchers of 1000 postings each are created, each with its own
    constant weight and its own WeightScorer. They are paired into three
    UnionMatchers; the third is wrapped in an InverseMatcher with limit 15.
    The two plain unions and the inverse are combined into a MultiMatcher
    with offsets 0, 9 and 18, which is returned.
    """
    count = 1000
    id_lists = (
        list(range(count)),
        [i + 1 for i in range(count)],
        [i * 2 + i % 5 for i in range(count)],
        [i * i for i in range(count)],
        [1001 - i for i in range(count)],
        [i * 3 // 2 for i in range(count)],
    )
    weights = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6)
    scorer_weights = (0.15, 0.25, 0.35, 0.45, 0.55, 0.65)

    leaves = []
    for ids, weight, scorer_weight in zip(id_lists, weights, scorer_weights):
        values = [weight] * count
        scorer = scoring.WeightScorer(scorer_weight)
        leaves.append(matching.ListMatcher(ids, values, scorer))

    # Pair consecutive leaves: (1, 2), (3, 4), (5, 6).
    first_union = matching.UnionMatcher(leaves[0], leaves[1])
    second_union = matching.UnionMatcher(leaves[2], leaves[3])
    third_union = matching.UnionMatcher(leaves[4], leaves[5])
    inverse = matching.InverseMatcher(third_union, 15)

    return matching.MultiMatcher([first_union, second_union, inverse],
                                 [0, 9, 18])
def matcher(self, searcher, weighting=None):
    """Return a matcher for all the terms this query expands to.

    ``self._words(reader)`` supplies the words; each becomes a ``Term`` on
    ``self.fieldname``. The result depends on how many terms there are:

    * none: the null matcher;
    * exactly one: that term's own matcher;
    * ``self.constantscore`` is set, or there are more than
      ``self.TOO_MANY_CLAUSES`` terms: all postings are read into memory and
      served by a single ListMatcher;
    * otherwise: the matcher of an ``Or`` query over the terms.
    """
    field = self.fieldname
    constant = self.constantscore
    reader = searcher.reader()
    terms = [Term(field, word) for word in self._words(reader)]

    if not terms:
        return matching.NullMatcher()

    if len(terms) == 1:
        # If there's only one term, just use it
        return terms[0].matcher(searcher, weighting=weighting)

    if not (constant or len(terms) > self.TOO_MANY_CLAUSES):
        # The default case: Or the terms together
        from whoosh.query import Or
        return Or(terms).matcher(searcher, weighting=weighting)

    # With this many clauses an Or search would take forever (or a constant
    # score was requested), so trade memory for time: collect every matching
    # document up front and serve the lot from a ListMatcher.
    fmt = searcher.schema[field].format
    values_by_doc = defaultdict(list)
    weights_by_doc = defaultdict(float)
    for term in terms:
        postings = term.matcher(searcher)
        while postings.is_active():
            docnum = postings.id()
            values_by_doc[docnum].append(postings.value())
            if not constant:
                # Weights only matter when documents are scored individually.
                weights_by_doc[docnum] += postings.weight()
            postings.next()

    docnums = sorted(values_by_doc)
    # One list of value strings per document -- ListMatcher does the work of
    # combining multiple values if the user asks for them.
    options = {
        "values": [values_by_doc[docnum] for docnum in docnums],
        "format": fmt,
    }
    if constant:
        options["all_weights"] = self.boost
    else:
        options["weights"] = [weights_by_doc[docnum] for docnum in docnums]
    return matching.ListMatcher(docnums, **options)
def test_random_union():
    """Randomised check that a tree of UnionMatchers equals a plain set union."""
    testcount = 100
    rangelimits = (2, 10)
    clauselimits = (2, 10)
    vals = list(range(100))

    for _ in xrange(testcount):
        expected = set()
        leaves = []
        # Random number of clauses, each a random-sized sample of `vals`.
        for _clause in xrange(randint(*clauselimits)):
            picked = sample(vals, randint(*rangelimits))
            expected.update(picked)
            leaves.append(matching.ListMatcher(sorted(picked)))

        um = make_binary_tree(matching.UnionMatcher, leaves)
        assert list(um.all_ids()) == sorted(expected)
def matcher(self, searcher, context=None):
    """Return a ListMatcher over the documents selected by ``self.fieldname``.

    When the field name is ``None``, ``""`` or ``"*"``, all document ids
    reported by the reader are used. For any other field, the posting lists
    of every term in the field are merged. Every document gets ``self.boost``
    as its weight. ``context`` is accepted but not used here.
    """
    field = self.fieldname
    reader = searcher.reader()

    if field not in (None, "", "*"):
        # Slow path: gather the document numbers of every term in the field
        # into an in-memory set. This gets expensive on large indexes.
        collected = set()
        for text in searcher.lexicon(field):
            postings = searcher.postings(field, text)
            collected.update(postings.all_ids())
        docs = sorted(collected)
    else:
        # The reader's own id list already takes deletions into account.
        docs = array("I", reader.all_doc_ids())

    return matching.ListMatcher(docs, all_weights=self.boost)