class WorkspaceSchema(fields.SchemaClass):
    id = fields.ID(stored=True, unique=True)
    owner = fields.TEXT(stored=True, spelling=True)
    name = fields.TEXT(stored=True, spelling=True)
    description = fields.NGRAM(stored=True, minsize=1, phrase=True)
    lastmodified = fields.DATETIME(stored=True)
    longdescription = fields.NGRAM(stored=True, minsize=1, phrase=True)
    public = fields.BOOLEAN(stored=True)
    users = fields.KEYWORD(commas=True)
    groups = fields.KEYWORD(commas=True)
    shared = fields.BOOLEAN(stored=True)
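A minimal usage sketch for the schema above, assuming a made-up index directory and document; it shows how the NGRAM description field matches a partial word ("boar" inside "dashboard"):

import os

from whoosh import index
from whoosh.qparser import QueryParser

os.makedirs("workspace_index", exist_ok=True)  # hypothetical index directory
ix = index.create_in("workspace_index", WorkspaceSchema())

writer = ix.writer()
writer.add_document(id="1", owner="admin", name="Demo workspace",
                    description="dashboard for sales metrics",
                    public=True, shared=False)
writer.commit()

with ix.searcher() as searcher:
    # NGRAM fields index substrings, so the fragment "boar" finds "dashboard"
    query = QueryParser("description", ix.schema).parse("boar")
    print([hit["name"] for hit in searcher.search(query)])  # ['Demo workspace']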
def create_whoosh_schema(self):
    """
    Dynamically create whoosh.fields.SchemaClass schema object.
    It defines how you index your dataset.

    :rtype: SchemaClass
    """
    schema_classname = "WhooshSchema"
    schema_classname = str(schema_classname)
    attrs = OrderedDict()
    for c_setting in self.columns:
        if c_setting.type_is_ngram:
            field = fields.NGRAM(
                minsize=c_setting.ngram_minsize,
                maxsize=c_setting.ngram_maxsize,
                stored=True,
            )
        elif c_setting.type_is_phrase:
            field = fields.TEXT(stored=True)
        elif c_setting.type_is_keyword:
            field = fields.KEYWORD(
                lowercase=c_setting.keyword_lowercase,
                commas=c_setting.keyword_commas,
                stored=True,
            )
        else:
            field = fields.STORED()
        attrs[c_setting.name] = field
    SchemaClass = type(schema_classname, (fields.SchemaClass,), attrs)
    schema = SchemaClass()  # type: SchemaClass
    return schema
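The method above assumes each item in self.columns exposes a name plus type flags and n-gram/keyword options. A sketch of what those settings objects might look like (ColumnSetting and DatasetConfig are illustrative assumptions, not part of the original code):

from collections import OrderedDict

from whoosh import fields


class ColumnSetting:
    """Hypothetical per-column config with the attributes the method reads."""

    def __init__(self, name, type_is_ngram=False, type_is_phrase=False,
                 type_is_keyword=False, ngram_minsize=2, ngram_maxsize=4,
                 keyword_lowercase=True, keyword_commas=True):
        self.name = name
        self.type_is_ngram = type_is_ngram
        self.type_is_phrase = type_is_phrase
        self.type_is_keyword = type_is_keyword
        self.ngram_minsize = ngram_minsize
        self.ngram_maxsize = ngram_maxsize
        self.keyword_lowercase = keyword_lowercase
        self.keyword_commas = keyword_commas


class DatasetConfig:
    """Hypothetical owner class; reuses the method defined above."""

    create_whoosh_schema = create_whoosh_schema

    def __init__(self, columns):
        self.columns = columns


config = DatasetConfig([ColumnSetting("title", type_is_ngram=True),
                        ColumnSetting("tags", type_is_keyword=True),
                        ColumnSetting("raw")])
schema = config.create_whoosh_schema()
print(sorted(schema.names()))  # ['raw', 'tags', 'title']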
# u() and TempIndex are whoosh test helpers (whoosh.compat and
# whoosh.support.testing in Whoosh 2.x)
def test_suggest_prefix():
    domain = ("Shoot To Kill",
              "Bloom, Split and Deviate",
              "Rankle the Seas and the Skies",
              "Lightning Flash Flame Shell",
              "Flower Wind Rage and Flower God Roar, Heavenly Wind Rage and "
              "Heavenly Demon Sneer",
              "All Waves, Rise now and Become my Shield, Lightning, Strike "
              "now and Become my Blade",
              "Cry, Raise Your Head, Rain Without end",
              "Sting All Enemies To Death",
              "Reduce All Creation to Ash",
              "Sit Upon the Frozen Heavens",
              "Call forth the Twilight")

    schema = fields.Schema(content=fields.TEXT(stored=True, spelling=True),
                           quick=fields.NGRAM(maxsize=10, stored=True))

    with TempIndex(schema, "sugprefix") as ix:
        with ix.writer() as w:
            for item in domain:
                content = u(item)
                w.add_document(content=content, quick=content)

        with ix.searcher() as s:
            # prefix=2: suggestions must share the first two letters of "ra"
            sugs = s.suggest("content", u("ra"), maxdist=2, prefix=2)
            assert sugs == ['rage', 'rain']

            # prefix=1: only the first letter must match, so "roar" qualifies
            sugs = s.suggest("content", "ra", maxdist=2, prefix=1)
            assert sugs == ["rage", "rain", "roar"]
def create_whoosh_schema(self):
    """Dynamically build a whoosh SchemaClass from the configured column lists."""
    schema_classname = "WhooshSchema"
    schema_classname = str(schema_classname)
    attrs = OrderedDict()
    for c in self.columns:
        if c in self.ngram_columns:
            field = fields.NGRAM(
                minsize=self.ngram_minsize,
                maxsize=self.ngram_maxsize,
                stored=True,
            )
        elif c in self.phrase_columns:
            field = fields.TEXT(stored=True)
        elif c in self.keyword_columns:
            field = fields.KEYWORD(
                lowercase=self.keyword_lowercase,
                commas=self.keyword_commas,
                stored=True,
            )
        else:
            field = fields.STORED()
        attrs[c] = field
    SchemaClass = type(schema_classname, (fields.SchemaClass,), attrs)
    schema = SchemaClass()
    return schema
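This variant looks each column name up in per-type lists on the owner object. A hypothetical settings holder wired to the method above (SearchSettings and its attribute values are assumptions for illustration):

from collections import OrderedDict

from whoosh import fields


class SearchSettings:
    """Hypothetical holder for the attributes this variant reads."""

    create_whoosh_schema = create_whoosh_schema

    columns = ["title", "body", "tags", "raw"]
    ngram_columns = ["title"]
    phrase_columns = ["body"]
    keyword_columns = ["tags"]
    ngram_minsize = 2
    ngram_maxsize = 10
    keyword_lowercase = True
    keyword_commas = True


schema = SearchSettings().create_whoosh_schema()
print(sorted(schema.names()))  # ['body', 'raw', 'tags', 'title']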
class UserSchema(fields.SchemaClass):
    pk = fields.ID(stored=True, unique=True)
    full_name = fields.TEXT(stored=True, spelling=True)
    username = fields.TEXT(stored=True, spelling=True)
    email = fields.TEXT(stored=True, spelling=True)
    content = fields.NGRAM(phrase=True)
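A possible way to use this schema for search-as-you-type over user records; the in-memory index and sample user are made up. Note that content is an unstored NGRAM field: it is searchable, but only the other fields come back in results.

from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

ix = RamStorage().create_index(UserSchema())
with ix.writer() as w:
    w.add_document(pk="1", full_name="Ada Lovelace", username="ada",
                   email="ada@example.com",
                   content="Ada Lovelace ada ada@example.com")

with ix.searcher() as s:
    q = QueryParser("content", ix.schema).parse("ovel")  # substring of "Lovelace"
    print([hit["full_name"] for hit in s.search(q)])  # ['Ada Lovelace']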
# RamStorage comes from whoosh.filedb.filestore; u() from whoosh.compat
def test_ngram_phrase():
    schema = fields.Schema(text=fields.NGRAM(minsize=2, maxsize=2, phrase=True),
                           path=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)
    writer = ix.writer()
    # Japanese sample: "Until high school, Tokyo; from university on, Kyodai."
    writer.add_document(text=u('\u9AD8\u6821\u307E\u3067\u306F\u6771\u4EAC\u3067\u3001'
                               '\u5927\u5B66\u304B\u3089\u306F\u4EAC\u5927\u3067\u3059\u3002'),
                        path=u('sample'))
    writer.commit()

    with ix.searcher() as s:
        p = qparser.QueryParser("text", schema)

        # unquoted: every bigram of the query occurs somewhere in the text
        q = p.parse(u('\u6771\u4EAC\u5927\u5B66'))
        assert_equal(len(s.search(q)), 1)

        # quoted phrase: the bigrams never occur consecutively, so no match
        q = p.parse(u('"\u6771\u4EAC\u5927\u5B66"'))
        assert_equal(len(s.search(q)), 0)

        # quoted phrase that does occur verbatim in the text
        q = p.parse(u('"\u306F\u6771\u4EAC\u3067"'))
        assert_equal(len(s.search(q)), 1)
from whoosh import fields
from whoosh import analysis
from whoosh.qparser import QueryParser
from whoosh.qparser import MultifieldParser
from indexing.models import Job
import sys
import csv

WHOOSH_SCHEMA = fields.Schema(jobtitle=fields.KEYWORD(stored=True),
                              company=fields.KEYWORD(stored=True),
                              city=fields.KEYWORD(stored=True),
                              state=fields.KEYWORD(stored=True),
                              country=fields.KEYWORD(stored=True),
                              source=fields.KEYWORD(stored=True),
                              date=fields.KEYWORD(stored=True),
                              JD=fields.NGRAM(stored=True),
                              url=fields.KEYWORD(stored=True),
                              latitude=fields.KEYWORD(stored=True),
                              longitude=fields.KEYWORD(stored=True),
                              relative_time=fields.KEYWORD(stored=True),
                              job_id=fields.KEYWORD(stored=True),
                              category=fields.KEYWORD(stored=True))

# ana = analysis.StemmingAnalyzer()

columns = ["jobtitle", "company", "city", "state", "country", "source",
           "date", "JD", "url", "latitude", "longitude", "relative_time",
           "job_id", "category"]
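A sketch of how this module might index the CSV and query the NGRAM JD field; the file name "jobs.csv", the index directory, and the search term are assumptions:

import os

from whoosh import index

os.makedirs("job_index", exist_ok=True)
ix = index.create_in("job_index", WHOOSH_SCHEMA)

writer = ix.writer()
with open("jobs.csv", newline="") as f:
    # assumes jobs.csv has no header row and its columns match `columns`
    for row in csv.DictReader(f, fieldnames=columns):
        writer.add_document(**{col: row[col] for col in columns})
writer.commit()

with ix.searcher() as searcher:
    # NGRAM matching lets a fragment like "pytho" hit "Python" in the JD text
    query = QueryParser("JD", ix.schema).parse("pytho")
    for hit in searcher.search(query, limit=5):
        print(hit["jobtitle"], hit["company"])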
class BookmarkSchema(fields.SchemaClass):
    url = fields.NGRAM(minsize=2, maxsize=10, stored=True)
    title = fields.NGRAM(minsize=2, maxsize=10, stored=True)
class GPSSchema(fields.SchemaClass):
    url = fields.NGRAM(minsize=2, maxsize=10, stored=True)
    title = fields.NGRAM(minsize=2, maxsize=10, stored=True)
    tags = fields.KEYWORD()
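Both this schema and BookmarkSchema above index 2-10 character n-grams over the url and title, which suits find-as-you-type lookups. A small in-memory sketch with made-up data:

from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

ix = RamStorage().create_index(GPSSchema())
with ix.writer() as w:
    w.add_document(url="https://example.com/trails/lakeside",
                   title="Lakeside trail head", tags="hiking gps")

with ix.searcher() as s:
    q = QueryParser("title", ix.schema).parse("akesi")  # substring of "Lakeside"
    print([hit["title"] for hit in s.search(q)])  # ['Lakeside trail head']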