class WorkspaceSchema(fields.SchemaClass):
    id = fields.ID(stored=True, unique=True)
    owner = fields.TEXT(stored=True, spelling=True)
    name = fields.TEXT(stored=True, spelling=True)
    description = fields.NGRAM(stored=True, minsize=1, phrase=True)
    lastmodified = fields.DATETIME(stored=True)
    longdescription = fields.NGRAM(stored=True, minsize=1, phrase=True)
    public = fields.BOOLEAN(stored=True)
    users = fields.KEYWORD(commas=True)
    groups = fields.KEYWORD(commas=True)
    shared = fields.BOOLEAN(stored=True)

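A minimal sketch of how the BOOLEAN "public" field above behaves at query time; the in-memory index and the two documents are illustrative assumptions, not part of the original project:

# Sketch (illustrative data): querying a BOOLEAN field via QueryParser
from whoosh import qparser
from whoosh.filedb.filestore import RamStorage

ix = RamStorage().create_index(WorkspaceSchema())
with ix.writer() as w:
    w.add_document(id="1", name="demo", public=True, shared=False)
    w.add_document(id="2", name="internal", public=False, shared=True)

with ix.searcher() as s:
    q = qparser.QueryParser("public", ix.schema).parse("true")
    print([hit["name"] for hit in s.search(q)])  # ["demo"]
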
def test_boolean_strings():
    schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(i=0, b="true")
        w.add_document(i=1, b="True")
        w.add_document(i=2, b="false")
        w.add_document(i=3, b="False")
        w.add_document(i=4, b=u("true"))
        w.add_document(i=5, b=u("True"))
        w.add_document(i=6, b=u("false"))
        w.add_document(i=7, b=u("False"))

    with ix.searcher() as s:
        qp = qparser.QueryParser("b", ix.schema)

        def check(qs, nums):
            q = qp.parse(qs)
            r = s.search(q, limit=None)
            assert [hit["i"] for hit in r] == nums

        trues = [0, 1, 4, 5]
        falses = [2, 3, 6, 7]
        check("true", trues)
        check("True", trues)
        check("false", falses)
        check("False", falses)
        check("t", trues)
        check("f", falses)

def get_schema(model, analyzer):
    schema = {}
    primary = None
    searchable = set(getattr(model, '__searchable__', []))
    for field in model.__table__.columns:
        # primary key id
        if field.primary_key:
            schema[field.name] = whoosh_fields.ID(stored=True, unique=True,
                                                  sortable=True)
            primary = field.name

        if field.name not in searchable:
            continue

        # text types
        if isinstance(field.type, TEXT_TYPES):
            schema[field.name] = whoosh_fields.TEXT(analyzer=analyzer)
        elif isinstance(field.type, DATE_TYPES):
            is_unique = getattr(field, 'unique', False)
            schema[field.name] = whoosh_fields.DATETIME(unique=is_unique)
        elif isinstance(field.type, sql_types.Boolean):
            schema[field.name] = whoosh_fields.BOOLEAN()
        else:
            raise WhooshAlchemyError('cannot index column of type %s'
                                     % field.type)
    return whoosh_fields.Schema(**schema), primary

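For context, a hypothetical sketch of driving get_schema with a SQLAlchemy model. TEXT_TYPES and DATE_TYPES are assumed here to be tuples of SQLAlchemy column types, since their real definitions are outside this excerpt; the Post model is invented:

# Assumed definitions; the real module defines these elsewhere
import sqlalchemy.types as sql_types
from sqlalchemy import Boolean, Column, Integer, String
from sqlalchemy.orm import declarative_base
from whoosh.analysis import StemmingAnalyzer

TEXT_TYPES = (sql_types.String, sql_types.Text, sql_types.Unicode)
DATE_TYPES = (sql_types.Date, sql_types.DateTime)

Base = declarative_base()

class Post(Base):  # hypothetical model
    __tablename__ = 'post'
    __searchable__ = ['title', 'published']
    id = Column(Integer, primary_key=True)
    title = Column(String)
    published = Column(Boolean)

schema, primary = get_schema(Post, StemmingAnalyzer())
# schema maps: id -> ID, title -> TEXT, published -> BOOLEAN; primary == 'id'
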
def test_boolean():
    schema = fields.Schema(id=fields.ID(stored=True),
                           done=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=u("a"), done=True)
    w.add_document(id=u("b"), done=False)
    w.add_document(id=u("c"), done=True)
    w.add_document(id=u("d"), done=False)
    w.add_document(id=u("e"), done=True)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("done:true"))
        assert_equal(sorted([d["id"] for d in r]), ["a", "c", "e"])
        assert all(d["done"] for d in r)

        r = s.search(qp.parse("done:yes"))
        assert_equal(sorted([d["id"] for d in r]), ["a", "c", "e"])
        assert all(d["done"] for d in r)

        r = s.search(qp.parse("done:false"))
        assert_equal(sorted([d["id"] for d in r]), ["b", "d"])
        assert not any(d["done"] for d in r)

        r = s.search(qp.parse("done:no"))
        assert_equal(sorted([d["id"] for d in r]), ["b", "d"])
        assert not any(d["done"] for d in r)

def test_boolean():
    schema = fields.Schema(id=fields.ID(stored=True),
                           done=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(id=u("a"), done=True)
    w.add_document(id=u("b"), done=False)
    w.add_document(id=u("c"), done=True)
    w.add_document(id=u("d"), done=False)
    w.add_document(id=u("e"), done=True)
    w.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        r = s.search(qp.parse("done:true"))
        assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
        assert all(d["done"] for d in r)

        r = s.search(qp.parse("done:yes"))
        assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
        assert all(d["done"] for d in r)

        q = qp.parse("done:false")
        assert q.__class__ == query.Term
        assert q.text is False
        assert schema["done"].to_bytes(False) == b("f")
        r = s.search(q)
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)

        r = s.search(qp.parse("done:no"))
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)

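As the to_bytes assertion above shows, Whoosh's BOOLEAN field indexes its values as the single-byte terms b"t" and b"f"; a quick standalone check:

from whoosh import fields

b = fields.BOOLEAN()
assert b.to_bytes(True) == b"t"   # True is stored as the term b"t"
assert b.to_bytes(False) == b"f"  # False is stored as the term b"f"
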
def create_index(self):
    if not os.path.exists("twitter_index"):
        os.mkdir("twitter_index")
    schema = fields.Schema(tweet_id=fields.TEXT(stored=True),
                           batch=fields.NUMERIC(stored=True),
                           content=fields.TEXT(stored=True),
                           posted=fields.DATETIME(stored=True),
                           owner_sn=fields.TEXT(stored=True),
                           owner_id=fields.TEXT(stored=True),
                           owner_name=fields.TEXT(stored=True),
                           isRT=fields.BOOLEAN(stored=True),
                           timesRT=fields.NUMERIC(stored=True),
                           timesFav=fields.NUMERIC(stored=True),
                           orig_timesRT=fields.NUMERIC(stored=True),
                           orig_timesFav=fields.NUMERIC(stored=True),
                           hashtags=fields.KEYWORD(stored=True),
                           orgnlTweet=fields.TEXT(stored=True),
                           mentions=fields.KEYWORD(stored=True),
                           media=fields.TEXT(stored=True),
                           url=fields.TEXT(stored=True),
                           liwc=fields.TEXT(stored=True))
    self.INDEX = index.create_in("twitter_index", schema, indexname="TWTTR")
    print("New searching index successfully created")
    return self.INDEX

class TweetSchema(fields.SchemaClass):
    id = fields.ID(stored=True, unique=True)
    url = fields.ID(stored=True, unique=True)
    text = fields.TEXT(stored=True)
    source = fields.TEXT(stored=True)
    reply = fields.BOOLEAN(stored=True)
    in_reply_to_id = fields.TEXT(stored=True)
    in_reply_to_name = fields.TEXT(stored=True)
    user_mentions = fields.KEYWORD(stored=True)
    hashtags = fields.KEYWORD(stored=True)
    urls = fields.KEYWORD(stored=True)
    geo = fields.BOOLEAN(stored=True)
    latitude = fields.NUMERIC(stored=True)
    longitude = fields.NUMERIC(stored=True)
    date = fields.DATETIME(stored=True)

def test_boolean3():
    schema = fields.Schema(t=fields.TEXT(stored=True, field_boost=5),
                           b=fields.BOOLEAN(stored=True),
                           c=fields.TEXT)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(t=u("with hardcopy"), b=True, c=u("alfa"))
        w.add_document(t=u("no hardcopy"), b=False, c=u("bravo"))

    with ix.searcher() as s:
        q = query.Term("b", schema["b"].to_bytes(True))
        ts = [hit["t"] for hit in s.search(q)]
        assert ts == ["with hardcopy"]

def test_boolean_find_deleted():
    # "Random" string of ones and zeros representing deleted and undeleted
    domain = "1110001010001110010101000101001011101010001011111101000101010101"

    schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    count = 0
    # Create multiple segments just in case
    for _ in xrange(5):
        w = ix.writer()
        for c in domain:
            w.add_document(i=count, b=(c == "1"))
            count += 1
        w.commit(merge=False)

    # Delete documents where "b" is True
    with ix.writer() as w:
        w.delete_by_term("b", "t")

    with ix.searcher() as s:
        # Double check that documents with b=True are all deleted
        reader = s.reader()
        for docnum in xrange(s.doc_count_all()):
            b = s.stored_fields(docnum)["b"]
            assert b == reader.is_deleted(docnum)

        # Try doing a search for documents where b=True
        qp = qparser.QueryParser("b", ix.schema)
        q = qp.parse("b:t")
        r = s.search(q, limit=None)
        assert len(r) == 0

        # Make sure Every query doesn't match deleted docs
        r = s.search(qp.parse("*"), limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        r = s.search(qp.parse("*:*"), limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        # Make sure NOT query doesn't match deleted docs
        q = qp.parse("NOT b:t")
        r = s.search(q, limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        r = s.search(q, limit=5)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

def test_query_schema_is_setup_correctly(self):
    # Given
    p = Project(name='test', path=self.root)

    # When
    p.scan()

    # Then
    schema = p._query_parser.schema
    items = schema.items()
    from whoosh import fields
    self.assertIn(('path', fields.TEXT()), items)
    self.assertIn(('ctime', fields.DATETIME()), items)
    self.assertIn(('completed', fields.BOOLEAN()), items)
    self.assertIn(('size', INT), items)

def crear_esquema():
    licorSchema = fields.Schema(
        id=fields.NUMERIC(stored=True),
        titulo=fields.TEXT(sortable=True, field_boost=1.5),
        descripcion=fields.TEXT,
        categoria=fields.TEXT(sortable=True),
        precio=fields.NUMERIC(Decimal, decimal_places=2, sortable=True),
        precioGroup=fields.NUMERIC(sortable=True),
        origen=fields.TEXT(sortable=True),
        graduacion=fields.NUMERIC(sortable=True),
        enStock=fields.BOOLEAN(stored=True),
        urlProducto=fields.TEXT(field_boost=0.5),
    )
    return licorSchema

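A sketch of filtering on the enStock BOOLEAN and sorting by the sortable precio column of the schema above; the two documents are invented for illustration:

from decimal import Decimal
from whoosh import qparser
from whoosh.filedb.filestore import RamStorage

ix = RamStorage().create_index(crear_esquema())
with ix.writer() as w:
    w.add_document(id=1, titulo="Ron añejo", precio=Decimal("19.90"), enStock=True)
    w.add_document(id=2, titulo="Ginebra", precio=Decimal("12.50"), enStock=False)

with ix.searcher() as s:
    # Only in-stock products, ordered by price
    q = qparser.QueryParser("enStock", ix.schema).parse("true")
    print([hit["id"] for hit in s.search(q, sortedby="precio")])  # [1]
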
def test_boolean_multifield():
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           bit=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(name=u('audi'), bit=True)
        w.add_document(name=u('vw'), bit=False)
        w.add_document(name=u('porsche'), bit=False)
        w.add_document(name=u('ferrari'), bit=True)
        w.add_document(name=u('citroen'), bit=False)

    with ix.searcher() as s:
        qp = qparser.MultifieldParser(["name", "bit"], schema)
        q = qp.parse(u("boop"))
        r = s.search(q)
        # BOOLEAN coerces unrecognized query strings with bool(), so the
        # non-empty string "boop" matches the documents where bit=True
        assert sorted(hit["name"] for hit in r) == ["audi", "ferrari"]
        assert len(r) == 2

def test_boolean2():
    schema = fields.Schema(t=fields.TEXT(stored=True),
                           b=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    writer = ix.writer()
    writer.add_document(t=u('some kind of text'), b=False)
    writer.add_document(t=u('some other kind of text'), b=False)
    writer.add_document(t=u('some more text'), b=False)
    writer.add_document(t=u('some again'), b=True)
    writer.commit()

    with ix.searcher() as s:
        qf = qparser.QueryParser('b', None).parse(u('f'))
        qt = qparser.QueryParser('b', None).parse(u('t'))
        r = s.search(qf)
        assert len(r) == 3

        assert [d["b"] for d in s.search(qt)] == [True]
        assert [d["b"] for d in s.search(qf)] == [False] * 3

def _init_schema():
    schema = fields.Schema()
    schema.add("id", fields.ID(unique=True, stored=True))
    schema.add("short_id", fields.ID(stored=True))
    schema.add("status", fields.ID(stored=True))
    schema.add("started", fields.DATETIME(stored=True))
    schema.add("stopped", fields.DATETIME(stored=True))
    schema.add("pkg_type", fields.ID(stored=True))
    schema.add("pkg_name", fields.ID(stored=True))
    schema.add("pkg_version", fields.ID(stored=True))
    schema.add("model_name", fields.ID(stored=True))
    schema.add("op_name", fields.ID(stored=True))
    schema.add("label", fields.TEXT(stored=True))
    # Glob fields: any field name matching the pattern gets this type
    schema.add("scalar_*", fields.NUMERIC(float, stored=True), glob=True)
    schema.add("flagi_*", fields.NUMERIC(int, stored=True), glob=True)
    schema.add("flagf_*", fields.NUMERIC(float, stored=True), glob=True)
    schema.add("flagb_*", fields.BOOLEAN(stored=True), glob=True)
    schema.add("flags_*", fields.ID(stored=True), glob=True)
    schema.add("priv_*", fields.STORED, glob=True)
    return schema

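A sketch of how the glob fields above behave at index time: any field name matching a pattern such as flagb_* is accepted and typed accordingly. The run values below are invented:

from whoosh.filedb.filestore import RamStorage

ix = RamStorage().create_index(_init_schema())
with ix.writer() as w:
    w.add_document(
        id="run-1",
        scalar_loss=0.25,         # matches scalar_* -> NUMERIC(float)
        flagb_verbose=True,       # matches flagb_*  -> BOOLEAN
        flags_optimizer="adam",   # matches flags_*  -> ID
    )

with ix.searcher() as s:
    print(s.document(id="run-1"))
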
from whoosh import fields
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import FuzzyTermPlugin, GtLtPlugin, MultifieldParser, PhrasePlugin
from whoosh.writing import AsyncWriter

from . import ConfigNotFound, EncryptedConfigStore, KEY_FIELD_NAME
from .serializers import Serializer

from jumpscale.sals.fs import join_paths, mkdirs

# a map between our indexable fields and whoosh fields
# for now we don't support nested fields like Lists or Objects
# they will only be stored but not indexed
FIELD_MAP = {
    "Boolean": fields.BOOLEAN(stored=True),
    "Bytes": fields.TEXT(stored=True),
    "Email": fields.TEXT(stored=True),
    "GUID": fields.TEXT(stored=True),
    "IPAddress": fields.TEXT(stored=True),
    "IPRange": fields.TEXT(stored=True),
    "Json": fields.TEXT(stored=True),
    "Path": fields.ID(stored=True),
    "String": fields.TEXT(stored=True),
    "Tel": fields.TEXT(stored=True),
    "URL": fields.TEXT(stored=True),
    "Integer": fields.NUMERIC(bits=64, stored=True, sortable=True),
    "Float": fields.NUMERIC(float, bits=64, stored=True, sortable=True),
    "Port": fields.NUMERIC(stored=True, sortable=True),
    "Date": fields.NUMERIC(stored=True, sortable=True),
    "DateTime": fields.NUMERIC(stored=True, sortable=True),
    "Time": fields.NUMERIC(stored=True, sortable=True),
}

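A sketch of how a map like this can be turned into a concrete Schema; the column names and type labels below are invented for illustration:

from whoosh import fields

# Hypothetical model columns -> type names understood by FIELD_MAP
columns = {"name": "String", "active": "Boolean", "port": "Port"}
schema = fields.Schema(**{col: FIELD_MAP[typ] for col, typ in columns.items()})
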
class Objects(object):
    INDEX_DIR = os.path.join(Globals.BASE_DIR, 'objects')
    INDEX = None
    SCHEMA = None
    TYPES = {
        u'escidoc-objid': fields.TEXT,
        u'old': fields.TEXT,
        u'text': fields.TEXT(analyzer=analysis.FancyAnalyzer(),
                             stored=True, chars=True),
        u'num': fields.NUMERIC(stored=True),
        u'boolean': fields.BOOLEAN(stored=True),
        u'bool': fields.BOOLEAN(stored=True),
        u'date': fields.TEXT(stored=True),
        u'arabic': fields.TEXT(analyzer=analysis.FancyAnalyzer(),
                               stored=True, chars=True),
        u'geoname-id': fields.TEXT
    }

    @classmethod
    def schema_fields(cls):
        sfields = set((att['ov'], att['content_type'])
                      for att in Definitions.all_atts())
        sfields = {k: cls.TYPES[v] for k, v in sfields}
        sfields.update(ov=fields.TEXT(analyzer=analysis.FancyAnalyzer(),
                                      stored=True, chars=True),
                       nov=fields.TEXT(analyzer=analysis.FancyAnalyzer(),
                                       stored=True, chars=True),
                       oc=fields.ID,
                       id=fields.ID(stored=True, unique=True))
        return sfields

    @classmethod
    def get_schema(cls):
        if cls.SCHEMA is None:
            cls.SCHEMA = fields.Schema(**cls.schema_fields())
        return cls.SCHEMA

    @classmethod
    def get_index(cls):
        if cls.INDEX is None:
            if not os.path.exists(cls.INDEX_DIR):
                os.mkdir(cls.INDEX_DIR)
            if index.exists_in(cls.INDEX_DIR):
                cls.INDEX = index.open_dir(cls.INDEX_DIR)
            else:
                cls.INDEX = index.create_in(cls.INDEX_DIR, cls.get_schema())
                for json_file in filter(lambda s: ".json" in s,
                                        os.listdir(Globals.DATA_DIR)):
                    fp = open(os.path.join(Globals.DATA_DIR, json_file), 'r')
                    resp = json.load(fp)
                    fp.close()
                    for ent in resp['ents']:
                        cls.add_ent(ent)
        return cls.INDEX

    @classmethod
    def add_ent(cls, ent):
        writer = cls.get_index().writer()
        kwargs = {
            'ov': unicode(ent.get('ov', '')),
            'nov': unicode(ent.get('nov', '')),
            'oc': unicode(ent.get('oc', '')),
            'id': unicode(ent.get('id', ''))
        }
        if 'atts' in ent:
            kwargs.update({att['name'].lower(): unicode(att.get('ov', ''))
                           for att in ent['atts']})
        writer.add_document(**kwargs)
        writer.commit()

    @classmethod
    def delete(cls, iden):
        writer = cls.get_index().writer()
        writer.delete_by_term('id', iden)
        writer.commit()

    @classmethod
    def sync_db(cls):
        for d in Definitions.get_defs():
            resp = JsonInterface.method('get_ents', oc=d['ov'],
                                        include_content='true')
            with cls.get_index().searcher() as s:
                for ent in resp['ents']:
                    stored_fields = s.document(id=unicode(ent['id']))
                    if stored_fields:
                        cls.delete(unicode(ent['id']))
                    cls.add_ent(ent)

    @classmethod
    def search(cls, filters, **kwargs):
        schema = cls.get_schema()

        def parse(filt):
            if filt.query_type == Filter.Q_APPROX:
                mp = qparser.MultifieldParser(filt.get_fields(), schema=schema)
                return mp.parse(unicode(filt.query_string))
            elif filt.query_type == Filter.Q_EXACT:
                s = cls.get_index().searcher()
                qs = filt.query_string
                f = lambda d: qs in [d.get(field)
                                     for field in filt.get_fields()]
                ids = [unicode(d['id']) for d in filter(f, s.documents())]
                return query.Or([query.Term('id', iden) for iden in ids])

        queries = [parse(filt) for filt in filters]
        q = query.And(queries)
        kwargs.update(limit=None, groupedby='oc')
        results = cls.get_index().searcher().search(q, **kwargs)
        return Results(results)

class ISPWhoosh(object):
    """
    Helper class to index the ISP model with Whoosh to allow full-text search
    """
    schema = fields.Schema(
        id=fields.ID(unique=True, stored=True),
        is_ffdn_member=fields.BOOLEAN(),
        is_disabled=fields.BOOLEAN(),
        name=fields.TEXT(),
        shortname=fields.TEXT(),
        description=fields.TEXT(),
        covered_areas=fields.KEYWORD(scorable=True, commas=True,
                                     lowercase=True),
        step=fields.NUMERIC(signed=False),
    )
    primary_key = schema._fields['id']

    @staticmethod
    def get_index_dir():
        return current_app.config.get('WHOOSH_INDEX_DIR', 'whoosh')

    @classmethod
    def get_index(cls):
        idxdir = cls.get_index_dir()
        if index.exists_in(idxdir):
            idx = index.open_dir(idxdir)
        else:
            if not os.path.exists(idxdir):
                os.makedirs(idxdir)
            idx = index.create_in(idxdir, cls.schema)
        return idx

    @classmethod
    def _search(cls, s, terms):
        # mask= excludes disabled ISPs from the results
        return s.search(
            qparser.MultifieldParser(
                ['name', 'shortname', 'description', 'covered_areas'],
                schema=cls.schema).parse(terms),
            mask=whoosh.query.Term('is_disabled', True))

    @classmethod
    def search(cls, terms):
        with ISPWhoosh.get_index().searcher() as s:
            sres = cls._search(s, terms)
            ranks = {}
            for rank, r in enumerate(sres):
                ranks[r['id']] = rank

            if not len(ranks):
                return []

            _res = ISP.query.filter(ISP.id.in_(ranks.keys()))
            return sorted(_res, key=lambda r: ranks[r.id])

    @classmethod
    def update_document(cls, writer, model):
        kw = {
            'id': unicode(model.id),
            '_stored_id': model.id,
            'is_ffdn_member': model.is_ffdn_member,
            'is_disabled': model.is_disabled,
            'name': model.name,
            'shortname': model.shortname,
            'description': model.json.get('description'),
            'covered_areas': ','.join(model.covered_areas_names()),
            'step': model.json.get('progressStatus')
        }
        writer.update_document(**kw)

    @classmethod
    def _after_flush(cls, app, changes):
        isp_changes = []
        for change in changes:
            if change[0].__class__ == ISP:
                update = change[1] in ('update', 'insert')
                isp_changes.append((update, change[0]))

        if not len(isp_changes):
            return

        idx = cls.get_index()
        with idx.writer() as writer:
            for update, model in isp_changes:
                if update:
                    cls.update_document(writer, model)
                else:
                    writer.delete_by_term(cls.primary_key, model.id)