from whoosh import analysis, fields
from whoosh.compat import u
from whoosh.filedb.filestore import RamStorage
from whoosh.formats import CharacterBoosts, PositionBoosts


def test_fractional_weights():
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()

    # With Positions format
    schema = fields.Schema(f=fields.TEXT(analyzer=ana))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(f=u("alfa^0.5 bravo^1.5 charlie^2.0 delta^1.5"))
    w.commit()

    with ix.searcher() as s:
        wts = []
        # The lexicon yields terms in sorted order, so the collected weights
        # correspond to alfa, bravo, charlie, delta
        for word in s.lexicon("f"):
            p = s.postings("f", word)
            wts.append(p.weight())
        assert wts == [0.5, 1.5, 2.0, 1.5]

    # Try again with Frequency format
    schema = fields.Schema(f=fields.TEXT(analyzer=ana, phrase=False))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(f=u("alfa^0.5 bravo^1.5 charlie^2.0 delta^1.5"))
    w.commit()

    with ix.searcher() as s:
        wts = []
        for word in s.lexicon("f"):
            p = s.postings("f", word)
            wts.append(p.weight())
        assert wts == [0.5, 1.5, 2.0, 1.5]
def test_posboost_postings():
    pbs = PositionBoosts()
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()
    content = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa")

    # _roundtrip (helper assumed to be defined elsewhere in this module) indexes
    # the content with the given format and reads the postings back as the
    # requested type. Full postings here are (position, boost) pairs per term.
    assert _roundtrip(content, pbs, "position_boosts", ana) == [
        ("alfa", [(0, 2), (4, 1), (5, 1)]),
        ("bravo", [(1, 0.1), (3, 0.5)]),
        ("charlie", [(2, 2)]),
    ]
    # The same postings can also be read back as plain positions or frequencies
    assert _roundtrip(content, pbs, "positions", ana) == [
        ("alfa", [0, 4, 5]),
        ("bravo", [1, 3]),
        ("charlie", [2]),
    ]
    assert _roundtrip(content, pbs, "frequency", ana) == [
        ("alfa", 3), ("bravo", 2), ("charlie", 1),
    ]
def test_charboost_postings():
    cbs = CharacterBoosts()
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()
    content = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa")

    # Full postings are (position, startchar, endchar, boost) tuples per term
    assert _roundtrip(content, cbs, "character_boosts", ana) == [
        ("alfa", [(0, 0, 4, 2), (4, 37, 41, 1), (5, 42, 46, 1)]),
        ("bravo", [(1, 7, 12, 0.1), (3, 27, 32, 0.5)]),
        ("charlie", [(2, 17, 24, 2)]),
    ]
    # The same postings can also be read back in the simpler formats
    assert _roundtrip(content, cbs, "position_boosts", ana) == [
        ("alfa", [(0, 2), (4, 1), (5, 1)]),
        ("bravo", [(1, 0.1), (3, 0.5)]),
        ("charlie", [(2, 2)]),
    ]
    assert _roundtrip(content, cbs, "characters", ana) == [
        ("alfa", [(0, 0, 4), (4, 37, 41), (5, 42, 46)]),
        ("bravo", [(1, 7, 12), (3, 27, 32)]),
        ("charlie", [(2, 17, 24)]),
    ]
    assert _roundtrip(content, cbs, "positions", ana) == [
        ("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2]),
    ]
    assert _roundtrip(content, cbs, "frequency", ana) == [
        ("alfa", 3), ("bravo", 2), ("charlie", 1),
    ]
def test_delimited_attribute():
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()
    results = [(t.text, t.boost) for t in ana(u("image render^2 file^0.5"))]
    assert results == [("image", 1.0), ("render", 2.0), ("file", 0.5)]
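
# A minimal extra sketch, not from the original suite: it assumes the filter's
# defaults of a "^" delimiter and a float "boost" token attribute, and checks
# that tokens without a delimiter keep the default boost of 1.0 while the
# "^value" suffix is stripped from the text of marked tokens.
def test_delimited_attribute_default_boost():
    ana = analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()
    results = [(t.text, t.boost) for t in ana(u("plain boosted^3"))]
    assert results == [("plain", 1.0), ("boosted", 3.0)]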