def test_character_postings():
    """Check the three views of a ``Characters`` posting for a fixed sentence.

    The sentence is passed through ``_roundtrip`` once per view and the
    result is compared with a hand-written expectation:

    * ``"characters"`` -- per term, ``(word index, start char, end char)``
      triples matching where the word sits in the sentence;
    * ``"positions"``  -- per term, only the word indexes;
    * ``"frequency"``  -- per term, the number of occurrences.
    """
    sentence = u("alfa bravo charlie bravo alfa alfa")

    # (view name, expected result) pairs, checked in this order.
    expectations = (
        ("characters", [
            ("alfa", [(0, 0, 4), (4, 25, 29), (5, 30, 34)]),
            ("bravo", [(1, 5, 10), (3, 19, 24)]),
            ("charlie", [(2, 11, 18)]),
        ]),
        ("positions", [
            ("alfa", [0, 4, 5]),
            ("bravo", [1, 3]),
            ("charlie", [2]),
        ]),
        ("frequency", [
            ("alfa", 3),
            ("bravo", 2),
            ("charlie", 1),
        ]),
    )

    for view, wanted in expectations:
        # A fresh format object is used for every view, as in each check.
        assert _roundtrip(sentence, Characters(), view) == wanted
def test_character_postings():
    """Round-trip randomly generated character postings through ``Characters``.

    Generates 20 documents with strictly increasing document numbers.  Each
    document carries between 1 and 10 ``(pos, startchar, endchar)`` triples
    whose positions and character offsets also strictly increase.  The
    generated list is then passed to ``roundtrip`` three times and compared
    against:

    * the list itself for the ``"characters"`` view;
    * ``(docnum, [pos, ...])`` pairs for the ``"positions"`` view;
    * ``(docnum, count)`` pairs for the ``"frequency"`` view.
    """
    generated = []
    doc_id = 0
    for _doc in xrange(20):
        doc_id += randint(1, 10)

        spans = []
        word_pos = 0
        char_end = 0
        span_count = randint(1, 10)
        for _span in xrange(span_count):
            word_pos += randint(1, 10)
            # Leave a gap of 3-10 chars after the previous span, then make
            # the span itself 3-10 chars long.
            char_start = char_end + randint(3, 10)
            char_end = char_start + randint(3, 10)
            spans.append((word_pos, char_start, char_end))

        generated.append((doc_id, spans))

    # Full view: what went in must come back unchanged.
    assert_equal(generated, roundtrip(generated, Characters(), "characters"))

    # Positions view: keep only the first element of every triple.
    positions_only = []
    for number, triples in generated:
        positions_only.append((number, [triple[0] for triple in triples]))
    assert_equal(positions_only,
                 roundtrip(generated, Characters(), "positions"))

    # Frequency view: one count per document.
    frequencies = [(number, len(found)) for number, found in positions_only]
    assert_equal(frequencies,
                 roundtrip(generated, Characters(), "frequency"))
from whoosh.fields import (TEXT, ID, STORED, NUMERIC, BOOLEAN, Schema, FieldType, DATETIME) # CUSTOM ANALYZER wordsplit + lowercase filter for case insensitive search ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter() # FILE INDEX SCHEMA DEFINITION FILE_INDEX_NAME = 'FILE_INDEX' FILE_SCHEMA = Schema( fileid=ID(unique=True), # Path repository=ID(stored=True), repository_id=NUMERIC(unique=True, stored=True), # Numeric id of repo repo_name=TEXT(stored=True), owner=TEXT(), path=TEXT(stored=True), content=FieldType(format=Characters(), analyzer=ANALYZER, scorable=True, stored=True), modtime=STORED(), md5=STORED(), extension=ID(stored=True), commit_id=TEXT(stored=True), size=NUMERIC(stored=True), mimetype=TEXT(stored=True), lines=NUMERIC(stored=True), ) # COMMIT INDEX SCHEMA COMMIT_INDEX_NAME = 'COMMIT_INDEX' COMMIT_SCHEMA = Schema(
from whoosh.formats import Characters
from whoosh.highlight import highlight as whoosh_highlight, HtmlFormatter, ContextFragmenter

from kallithea.lib.utils2 import LazyProperty

# Module-level logger named after this module.
log = logging.getLogger(__name__)

# Custom analyzer: split text into runs of word characters (\w+), then
# lowercase every token.
ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()

# Index schema definition for file content.
SCHEMA = Schema(
    fileid=ID(unique=True),
    owner=TEXT(),
    repository=TEXT(stored=True),
    path=TEXT(stored=True),
    # File content is indexed with the custom analyzer above and the
    # Characters posting format.
    content=FieldType(format=Characters(), analyzer=ANALYZER,
                      scorable=True, stored=True),
    modtime=STORED(),
    extension=TEXT(stored=True)
)

# Name of the file-content index.
IDX_NAME = 'HG_INDEX'
# Highlight formatter built on 'span'; the `between` string is a
# "..." break marker wrapped in <span class="break">.
# NOTE(review): presumably emitted between adjacent fragments -- confirm
# against Whoosh's HtmlFormatter.
FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
# NOTE(review): 200 is the fragmenter's first positional argument
# (presumably the maximum fragment size in characters) -- confirm against
# Whoosh's ContextFragmenter signature.
FRAGMENTER = ContextFragmenter(200)

# Schema for the changeset index.
CHGSETS_SCHEMA = Schema(
    raw_id=ID(unique=True, stored=True),
    date=NUMERIC(stored=True),
    last=BOOLEAN(),
    owner=TEXT(),
    repository=ID(unique=True, stored=True),