def get_cache_schema():
    schema = Schema(
        key=ID(unique=True, stored=True),  # Copied from Zotero.
        version=NUMERIC(stored=True),      # Copied from Zotero.
        parentItem=ID(stored=True),        # Kerko addition.
        itemType=ID(stored=True),          # Kerko addition.
        library=STORED,                    # Copied from Zotero & JSON-encoded.
        links=STORED,                      # Copied from Zotero & JSON-encoded.
        meta=STORED,                       # Copied from Zotero & JSON-encoded.
        data=STORED,                       # Copied from Zotero & JSON-encoded.
        fulltext=STORED,                   # Kerko addition.
    )
    for format_ in get_formats():
        schema.add(format_, STORED)
    return schema
def __init__(self, index_storage, backend, wiki_name=None, acl_rights_contents=[], **kw):
    """
    Store params, create schemas.
    """
    self.index_storage = index_storage
    self.backend = backend
    self.wikiname = wiki_name
    self.ix = {}  # open indexes
    self.schemas = {}  # existing schemas

    common_fields = {
        # wikiname so we can have a shared index in a wiki farm, always check this!
        WIKINAME: ID(stored=True),
        # namespace, so we can have different namespaces within a wiki, always check this!
        NAMESPACE: ID(stored=True),
        # tokenized NAME from metadata - use this for manual searching from UI
        NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
        # unmodified NAME from metadata - use this for precise lookup by the code.
        # also needed for wildcard search, so the original string as well as the query
        # (with the wildcard) is not cut into pieces.
        NAME_EXACT: ID(field_boost=3.0),
        # revision id (aka meta id)
        REVID: ID(unique=True, stored=True),
        # parent revision id
        PARENTID: ID(stored=True),
        # backend name (which backend is this rev stored in?)
        BACKENDNAME: ID(stored=True),
        # MTIME from revision metadata (converted to UTC datetime)
        MTIME: DATETIME(stored=True),
        # publish time from metadata (converted to UTC datetime)
        PTIME: DATETIME(stored=True),
        # ITEMTYPE from metadata, always matched exactly hence ID
        ITEMTYPE: ID(stored=True),
        # tokenized CONTENTTYPE from metadata
        CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
        # unmodified list of TAGS from metadata
        TAGS: ID(stored=True),
        LANGUAGE: ID(stored=True),
        # USERID from metadata
        USERID: ID(stored=True),
        # ADDRESS from metadata
        ADDRESS: ID(stored=True),
        # HOSTNAME from metadata
        HOSTNAME: ID(stored=True),
        # SIZE from metadata
        SIZE: NUMERIC(stored=True),
        # ACTION from metadata
        ACTION: ID(stored=True),
        # tokenized COMMENT from metadata
        COMMENT: TEXT(stored=True),
        # SUMMARY from metadata
        SUMMARY: TEXT(stored=True),
        # DATAID from metadata
        DATAID: ID(stored=True),
        # TRASH from metadata
        TRASH: BOOLEAN(stored=True),
        # data (content), converted to text/plain and tokenized
        CONTENT: TEXT(stored=True, spelling=True),
    }
    latest_revs_fields = {
        # ITEMID from metadata - as there is only latest rev of same item here, it is unique
        ITEMID: ID(unique=True, stored=True),
        # unmodified list of ITEMLINKS from metadata
        ITEMLINKS: ID(stored=True),
        # unmodified list of ITEMTRANSCLUSIONS from metadata
        ITEMTRANSCLUSIONS: ID(stored=True),
        # tokenized ACL from metadata
        ACL: TEXT(analyzer=AclTokenizer(acl_rights_contents), multitoken_query="and", stored=True),
        # ngram words, index ngrams of words from main content
        CONTENTNGRAM: NGRAMWORDS(minsize=3, maxsize=6),
    }
    latest_revs_fields.update(**common_fields)

    userprofile_fields = {
        # Note: email / openid (if given) should be unique, but we might
        # have lots of empty values if it is not given and thus it is NOT
        # unique overall! Wrongly declaring it unique would lead to whoosh
        # killing other users from index when update_document() is called!
        EMAIL: ID(stored=True),
        OPENID: ID(stored=True),
        DISABLED: BOOLEAN(stored=True),
        LOCALE: ID(stored=True),
        SUBSCRIPTION_IDS: ID(),
        SUBSCRIPTION_PATTERNS: ID(),
    }
    latest_revs_fields.update(**userprofile_fields)

    # XXX This is a highly adhoc way to support indexing of ticket items.
    ticket_fields = {
        EFFORT: NUMERIC(stored=True),
        DIFFICULTY: NUMERIC(stored=True),
        SEVERITY: NUMERIC(stored=True),
        PRIORITY: NUMERIC(stored=True),
        ASSIGNED_TO: ID(stored=True),
        SUPERSEDED_BY: ID(stored=True),
        DEPENDS_ON: ID(stored=True),
        CLOSED: BOOLEAN(stored=True),
    }
    latest_revs_fields.update(**ticket_fields)

    blog_entry_fields = {}
    latest_revs_fields.update(**blog_entry_fields)

    all_revs_fields = {
        ITEMID: ID(stored=True),
    }
    all_revs_fields.update(**common_fields)

    latest_revisions_schema = Schema(**latest_revs_fields)
    all_revisions_schema = Schema(**all_revs_fields)

    # Define dynamic fields
    dynamic_fields = [("*_id", ID(stored=True)),
                      ("*_text", TEXT(stored=True)),
                      ("*_keyword", KEYWORD(stored=True)),
                      ("*_numeric", NUMERIC(stored=True)),
                      ("*_datetime", DATETIME(stored=True)),
                      ("*_boolean", BOOLEAN(stored=True)),
                      ]

    # Adding dynamic fields to schemas
    for glob, field_type in dynamic_fields:
        latest_revisions_schema.add(glob, field_type, glob=True)
        all_revisions_schema.add(glob, field_type, glob=True)

    # schemas are needed by query parser and for index creation
    self.schemas[ALL_REVS] = all_revisions_schema
    self.schemas[LATEST_REVS] = latest_revisions_schema

    # what fields could whoosh result documents have (no matter whether all revs index
    # or latest revs index):
    self.common_fields = set(latest_revs_fields.keys()) & set(all_revs_fields.keys())
def init_indexes_and_parsers():
    path = app.config['SEARCH_INDEX_PATH']

    # Initialize the documentations index
    name = 'doc'
    if exists_in(path, indexname=name):
        indexes['doc'] = open_dir(path, indexname=name)
    else:
        try:
            os.makedirs(path)
        except OSError:
            pass
        schema = Schema(
            id=ID(stored=True, unique=True),
        )
        schema.add(
            'title_*',
            TEXT(field_boost=2.0, analyzer=domotego_analyzer),
            glob=True
        )
        schema.add('text_*', TEXT(analyzer=domotego_analyzer), glob=True)
        indexes['doc'] = create_in(path, schema, indexname=name)
        index_docs(Page.objects(pagetype='doc'))

    # Initialize the categories index
    name = 'category'
    if exists_in(path, indexname=name):
        indexes['category'] = open_dir(path, indexname=name)
    else:
        try:
            os.makedirs(path)
        except OSError:
            pass
        schema = Schema(
            id=ID(stored=True, unique=True),
        )
        schema.add(
            'name_*',
            TEXT(field_boost=2.0, analyzer=domotego_analyzer),
            glob=True
        )
        schema.add(
            'description_*',
            TEXT(analyzer=domotego_analyzer),
            glob=True
        )
        indexes['category'] = create_in(path, schema, indexname=name)
        index_categories(Category.objects)

    # Initialize the products index
    name = 'product'
    if exists_in(path, indexname=name):
        indexes['product'] = open_dir(path, indexname=name)
    else:
        try:
            os.makedirs(path)
        except OSError:
            pass
        schema = Schema(
            id=ID(stored=True, unique=True),
            reference=KEYWORD,
            keywords=KEYWORD(lowercase=True, field_boost=1.5)
        )
        schema.add(
            'name_*',
            TEXT(field_boost=2.0, analyzer=domotego_analyzer),
            glob=True
        )
        schema.add(
            'description_*',
            TEXT(analyzer=domotego_analyzer),
            glob=True
        )
        indexes['product'] = create_in(path, schema, indexname=name)
        index_products(BaseProduct.objects)

    # Initialize the parsers
    docparserfields = []
    categoryparserfields = []
    productparserfields = ['reference', 'keywords']
    for lg in app.config['LANGS']:
        docparserfields.append('title_' + lg)
        docparserfields.append('text_' + lg)
        categoryparserfields.append('name_' + lg)
        categoryparserfields.append('description_' + lg)
        productparserfields.append('name_' + lg)
        productparserfields.append('description_' + lg)
    parsers['doc'] = qparser.MultifieldParser(
        docparserfields,
        schema=indexes['doc'].schema,
        termclass=FuzzierTerm
    )
    parsers['category'] = qparser.MultifieldParser(
        categoryparserfields,
        schema=indexes['category'].schema,
        termclass=FuzzierTerm
    )
    parsers['product'] = qparser.MultifieldParser(
        productparserfields,
        schema=indexes['product'].schema,
        termclass=FuzzierTerm
    )
class IndexSchema():
    """
    Init schema and build a custom analyzer.
    All data to index is put inside the same Whoosh index.
    """

    def __init__(self):
        chfilter = CharsetFilter(accent_map)
        stoplist = stoplists["en"].union(stoplists["fr"])
        analyzer = RegexTokenizer() | LowercaseFilter() | \
            StopFilter(stoplist=stoplist) | chfilter

        # Defines the schema.
        # See http://pythonhosted.org/Whoosh/schema.html for reference.
        keywordType = KEYWORD(lowercase=True, scorable=True)
        self.schema = Schema(content=TEXT(analyzer=analyzer),
                             docType=TEXT,
                             docId=ID(stored=True, unique=True),
                             tags=keywordType)

        # Adds dynamic fields so each document can index its fields in the
        # same Whoosh index.
        self.schema.add('*_string', TEXT(analyzer=analyzer), glob=True)
        self.schema.add('*_date', DATETIME, glob=True)
        self.schema.add('*_number', NUMERIC, glob=True)
        self.schema.add('*_boolean', BOOLEAN, glob=True)

        # Creates the index folder and the Whoosh index files if they don't
        # exist yet, and loads the index in any case.
        if not os.path.exists("indexes"):
            os.mkdir("indexes")
            self.index = index.create_in("indexes", self.schema)
        else:
            self.index = index.open_dir("indexes")

        # Creates the doctypes folder if it doesn't exist.
        if not os.path.exists("doctypes"):
            os.mkdir("doctypes")

        # Creates the default doctypes schema file if it doesn't exist.
        if not os.path.exists('doctypes/doctypes_schema.json'):
            with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
                defaultFile.write("{}")

        '''
        Loads the doctypes schema if it's valid, otherwise recreates it.
        The doctypes schema is a dictionary of doctypes with their fields,
        created and updated when a document is indexed. That way, we can tell
        Whoosh which fields to search by default, because there is apparently
        no way to say "search in all fields".
        '''
        with open('doctypes/doctypes_schema.json', 'r+') as rawJSON:
            try:
                self.doctypesSchema = json.load(rawJSON)
            except ValueError:
                rawJSON.write("{}")
                self.doctypesSchema = {}

    def update_doctypes_schema(self, schemaToUpdate):
        """
        Updates and persists the doctypes schema in its file.
        """
        self.doctypesSchema.update(schemaToUpdate)
        with open('doctypes/doctypes_schema.json', 'w') as fileObject:
            fileObject.write(json.dumps(self.doctypesSchema))

    def clear_index(self):
        """
        Clear the index: create_in() builds a new, empty index in the
        directory even if an index already exists there. The doctypes
        schema file is reset as well.
        """
        if os.path.exists("indexes"):
            index.create_in("indexes", self.schema)
        if os.path.exists("doctypes"):
            with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
                defaultFile.write("{}")
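A hypothetical usage sketch of the doctypes schema described above: pick the per-doctype field list to build a MultifieldParser, since Whoosh has no built-in "search every field" query. The "contact" doctype and the search term are invented for illustration.

from whoosh.qparser import MultifieldParser

index_schema = IndexSchema()

# Fields registered for this doctype; fall back to the static "content" field.
doc_fields = index_schema.doctypesSchema.get("contact", ["content"])

parser = MultifieldParser(doc_fields, schema=index_schema.schema)
query = parser.parse(u"dupont")

with index_schema.index.searcher() as searcher:
    for hit in searcher.search(query):
        print(hit["docId"])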
writer.delete_field("path") # Don't do this!!! writer.add_field("path", fields.KEYWORD) (Whoosh将来的版本可能会自动处理这个错误) Dynamic fields 动态fields可以使用通配符名字将field联系起来 可以使用add()方法(关键字参数glob为真)添加dynamic fields到一个新的schema: [python] view plain copy schema = fields.Schema(...) # Any name ending in "_d" will be treated as a stored # DATETIME field schema.add("*_d", fields.DATETIME(stored=True), glob=True) 在一个已经存在的索引上面设置dynamic fields,使用indexWriter.add_field方法就像你添加一个通常的field一样,保证glob参数为True [python] view plain copy writer = ix.writer() writer.add_field("*_d", fields.DATETIME(stored=True), glob=True) writer.commit() 删除一个dynamic fields可以使用IndexWriter.remove_field()方法(用glob作为名字) [html] view plain copy writer = ix.writer() writer.remove_field("*_d") writer.commit() 例如。为了使document包含以_id结尾的任意field名字,并且将他与所有的IDfield类型联系起来: [python] view plain copy schema = fields.Schema(path=fields.ID) schema.add("*_id", fields.ID, glob=True)
class WhooshStore(SAMLStoreBase):

    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in ATTRS.keys():
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage, FileStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in ix.searcher().search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in info.pop('entity_attributes').items():
                info[a] = v
        for a, v in info.items():
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]
            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)
        for a in info.keys():
            if not a in self.schema.names():
                del info[a]
        for a, v in info.items():
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))  # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0
        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1
        return ne

    def collections(self):
        return self._collections

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield ATTRS[n]

    def attributes(self):
        return list(self._attributes())

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return list(searcher.lexicon(n))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return self.objects.values()
            else:
                return self.infos.values()

        from whoosh.qparser import QueryParser
        #import pdb; pdb.set_trace()
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in ATTRS_INV.items():
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])
        return list(lst)
class RedisWhooshStore(SAMLStoreBase):  # TODO: This needs a gc mechanism for keys (uuids)

    def json_dict(self, name):
        return LRUProxyDict(JSONDict(key='{}_{}'.format(self._name, name),
                                     redis=self._redis,
                                     writeback=True),
                            maxsize=config.cache_size)

    def xml_dict(self, name):
        return LRUProxyDict(XMLDict(key='{}_{}'.format(self._name, name),
                                    redis=self._redis,
                                    writeback=True),
                            maxsize=config.cache_size)

    def __init__(self, *args, **kwargs):
        self._dir = kwargs.pop('directory', '.whoosh')
        clear = bool(kwargs.pop('clear', config.store_clear))
        self._name = kwargs.pop('name', config.store_name)
        self._redis = kwargs.pop('redis', redis())
        if clear:
            shutil.rmtree(self._dir)
        now = datetime.now()
        self._last_index_time = now
        self._last_modified = now
        self._setup()
        if clear:
            self.reset()

    def _setup(self):
        self._redis = getattr(self, '_redis', None)
        if not self._redis:
            self._redis = redis()  # XXX test cases won't get correctly unpicked because of this
        self.schema = Schema(content=NGRAMWORDS(stored=False))
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        self.schema.add('sha1', ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self.objects = self.xml_dict('objects')
        self.parts = self.json_dict('parts')
        self.storage = FileStorage(os.path.join(self._dir, self._name))
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except BaseException as ex:
            log.warn(ex)
            self.storage.create()
            self.index = self.storage.create_index(self.schema)
            self._reindex()

    def __getstate__(self):
        state = dict()
        for p in ('_dir', '_name', '_last_index_time', '_last_modified'):
            state[p] = getattr(self, p)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._setup()

    def __call__(self, *args, **kwargs):
        watched = kwargs.pop('watched', None)
        scheduler = kwargs.pop('scheduler', None)
        if watched is not None and scheduler is not None:
            super(RedisWhooshStore, self).__call__(watched=watched, scheduler=scheduler)
        log.debug("indexing using {}".format(scheduler))
        if scheduler is not None:  # and self._last_modified > self._last_index_time and :
            scheduler.add_job(RedisWhooshStore._reindex,
                              args=[self],
                              max_instances=1,
                              coalesce=True,
                              misfire_grace_time=2 * config.update_frequency)

    def _reindex(self):
        log.debug("indexing the store...")
        self._last_index_time = datetime.now()
        seen = set()
        refs = set([b2u(s) for s in self.objects.keys()])
        parts = self.parts.values()
        for ref in refs:
            for part in parts:
                if ref in part['items']:
                    seen.add(ref)

        ix = self.storage.open_index()
        lock = ix.lock("reindex")
        try:
            log.debug("waiting for index lock")
            lock.acquire(True)
            log.debug("got index lock")
            with ix.writer() as writer:
                for ref in refs:
                    if ref not in seen:
                        log.debug("removing unseen ref {}".format(ref))
                        del self.objects[ref]
                        del self.parts[ref]
                log.debug("updating index")
                for e in self.objects.values():
                    info = self._index_prep(entity_simple_info(e))
                    ref = object_id(e)
                    writer.add_document(object_id=ref, **info)
                writer.mergetype = CLEAR
        finally:
            try:
                log.debug("releasing index lock")
                lock.release()
            except ThreadError as ex:
                pass

    def dump(self):
        ix = self.storage.open_index()
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in ix.searcher().search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        res = dict()
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        content = " ".join(filter(lambda x: x is not None,
                                  [info.get(x, '') for x in
                                   ('service_name', 'title', 'domain', 'keywords', 'scopes')]))
        res['content'] = content.strip()
        for a, v in info.items():
            k = a
            if a in ATTRS_INV:
                k = ATTRS_INV[a]
            if k in self.schema.names():
                if type(v) in (list, tuple):
                    res[k] = " ".join([vv.lower() for vv in v])
                elif type(v) in six.string_types:
                    res[k] = info[a].lower()
        res['sha1'] = hash_id(info['entity_id'], prefix=False)
        return res

    def update(self, t, tid=None, etag=None, lazy=True):
        relt = root(t)
        assert (relt is not None)
        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            ref = object_id(relt)
            parts = None
            if ref in self.parts:
                parts = self.parts[ref]
            if etag is not None and (parts is None or parts.get('etag', None) != etag):
                self.parts[ref] = {'id': relt.get('entityID'),
                                   'etag': etag,
                                   'count': 1,
                                   'items': [ref]}
                self.objects[ref] = relt
                self._last_modified = datetime.now()
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            if etag is None:
                etag = hex_digest(dumptree(t, pretty_print=False), 'sha256')
            parts = None
            if tid in self.parts:
                parts = self.parts[tid]
            if parts is None or parts.get('etag', None) != etag:
                items = set()
                for e in iter_entities(t):
                    ref = object_id(e)
                    items.add(ref)
                    self.objects[ref] = e
                self.parts[tid] = {'id': tid,
                                   'count': len(items),
                                   'etag': etag,
                                   'items': list(items)}
                self._last_modified = datetime.now()
        if not lazy:
            self._reindex()

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def collections(self):
        return [b2u(ref) for ref in self.parts.keys()]

    def reset(self):
        for k in ('{}_{}'.format(self._name, 'parts'), '{}_{}'.format(self._name, 'objects')):
            self._redis.delete('{}_{}'.format(self._name, 'parts'))
            self._redis.delete('{}_{}'.format(self._name, 'objects'))

    def size(self, a=None, v=None):
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def _prep_key(self, key):
        # import pdb; pdb.set_trace()
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        key = key.replace('-', ' AND NOT ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()
        return key

    def _entities(self):
        lst = set()
        for ref_data in self.parts.values():
            for ref in ref_data['items']:
                e = self.objects.get(ref, None)
                if e is not None:
                    lst.add(e)
        return b2u(list(lst))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def lookup(self, key):
        if key == 'entities' or key is None:
            return self._entities()

        bkey = six.b(key)
        if bkey in self.objects:
            return [self.objects.get(bkey)]

        if bkey in self.parts:
            res = []
            part = self.parts.get(bkey)
            for item in part['items']:
                res.extend(self.lookup(item))
            return res

        key = self._prep_key(key)
        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                e = self.objects.get(result['object_id'], None)
                if e is not None:
                    lst.add(e)
        return b2u(list(lst))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def search(self, query=None, path=None, entity_filter=None, related=None):
        if entity_filter:
            query = "{!s} AND {!s}".format(query, entity_filter)
        query = self._prep_key(query)
        qp = MultifieldParser(['content', 'domain'], schema=self.schema)
        q = qp.parse(query)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            log.debug(results)
            for result in results:
                lst.add(result['object_id'])

        res = list()
        for ref in lst:
            e = self.objects.get(ref, None)
            if e is not None:
                res.append(discojson(e))
        return res
def __init__(self, index_dir, backend, wiki_name=None, acl_rights_contents=[], **kw):
    """
    Store params, create schemas.
    """
    self.index_dir = index_dir
    self.index_dir_tmp = index_dir + '.temp'
    self.backend = backend
    self.wikiname = wiki_name
    self.ix = {}  # open indexes
    self.schemas = {}  # existing schemas

    common_fields = {
        # wikiname so we can have a shared index in a wiki farm, always check this!
        WIKINAME: ID(stored=True),
        # tokenized NAME from metadata - use this for manual searching from UI
        NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
        # unmodified NAME from metadata - use this for precise lookup by the code.
        # also needed for wildcard search, so the original string as well as the query
        # (with the wildcard) is not cut into pieces.
        NAME_EXACT: ID(field_boost=3.0),
        # revision id (aka meta id)
        REVID: ID(unique=True, stored=True),
        # parent revision id
        PARENTID: ID(stored=True),
        # MTIME from revision metadata (converted to UTC datetime)
        MTIME: DATETIME(stored=True),
        # tokenized CONTENTTYPE from metadata
        CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
        # unmodified list of TAGS from metadata
        TAGS: ID(stored=True),
        LANGUAGE: ID(stored=True),
        # USERID from metadata
        USERID: ID(stored=True),
        # ADDRESS from metadata
        ADDRESS: ID(stored=True),
        # HOSTNAME from metadata
        HOSTNAME: ID(stored=True),
        # SIZE from metadata
        SIZE: NUMERIC(stored=True),
        # ACTION from metadata
        ACTION: ID(stored=True),
        # tokenized COMMENT from metadata
        COMMENT: TEXT(stored=True),
        # SUMMARY from metadata
        SUMMARY: TEXT(stored=True),
        # data (content), converted to text/plain and tokenized
        CONTENT: TEXT(stored=True),
    }
    latest_revs_fields = {
        # ITEMID from metadata - as there is only latest rev of same item here, it is unique
        ITEMID: ID(unique=True, stored=True),
        # unmodified list of ITEMLINKS from metadata
        ITEMLINKS: ID(stored=True),
        # unmodified list of ITEMTRANSCLUSIONS from metadata
        ITEMTRANSCLUSIONS: ID(stored=True),
        # tokenized ACL from metadata
        ACL: TEXT(analyzer=AclTokenizer(acl_rights_contents), multitoken_query="and", stored=True),
    }
    latest_revs_fields.update(**common_fields)

    userprofile_fields = {
        # Note: email / openid (if given) should be unique, but we might
        # have lots of empty values if it is not given and thus it is NOT
        # unique overall! Wrongly declaring it unique would lead to whoosh
        # killing other users from index when update_document() is called!
        EMAIL: ID(stored=True),
        OPENID: ID(stored=True),
    }
    latest_revs_fields.update(**userprofile_fields)

    all_revs_fields = {
        ITEMID: ID(stored=True),
    }
    all_revs_fields.update(**common_fields)

    latest_revisions_schema = Schema(**latest_revs_fields)
    all_revisions_schema = Schema(**all_revs_fields)

    # Define dynamic fields
    dynamic_fields = [("*_id", ID(stored=True)),
                      ("*_text", TEXT(stored=True)),
                      ("*_keyword", KEYWORD(stored=True)),
                      ("*_numeric", NUMERIC(stored=True)),
                      ("*_datetime", DATETIME(stored=True)),
                      ("*_boolean", BOOLEAN(stored=True)),
                      ]

    # Adding dynamic fields to schemas
    for glob, field_type in dynamic_fields:
        latest_revisions_schema.add(glob, field_type, glob=True)
        all_revisions_schema.add(glob, field_type, glob=True)

    # schemas are needed by query parser and for index creation
    self.schemas[ALL_REVS] = all_revisions_schema
    self.schemas[LATEST_REVS] = latest_revisions_schema

    # what fields could whoosh result documents have (no matter whether all revs index
    # or latest revs index):
    self.common_fields = set(latest_revs_fields.keys()) & set(all_revs_fields.keys())
def _build_doc_attrs(self, model_class: Type[Model], schema: Schema) -> None:
    mapper = sa.inspect(model_class)

    args = self.doc_attrs
    # Any field not in schema will be stored here.
    # After all fields have been discovered, we add the missing ones.
    field_definitions = {}

    def setup_field(attr_name: str,
                    field_name: Union[Tuple[str, Union[type, ID]], str]) -> None:
        field_def = False
        if not isinstance(field_name, str):
            field_name, field_def = field_name

        if field_name not in schema:
            if (field_name not in field_definitions
                    or field_definitions[field_name] is False):
                field_definitions[field_name] = field_def

        # attrgetter offers dotted name support. Useful for attributes on
        # related objects.
        args.setdefault(field_name, {})[attr_name] = attrgetter(attr_name)

    # model level definitions
    for name, field_names in self.index_to:
        if isinstance(field_names, str):
            field_names = (field_names,)
        for field_name in field_names:
            setup_field(name, field_name)

    # per column definitions
    for col in mapper.columns:
        name = col.name
        info = col.info

        if not info.get("searchable"):
            continue

        index_to = info.get("index_to", (name,))
        if isinstance(index_to, str):
            index_to = (index_to,)

        for field_name in index_to:
            setup_field(name, field_name)

    # add missing fields to schema
    for field_name, field_def in field_definitions.items():
        if field_name in schema:
            continue

        if field_def is False:
            field_def = TEXT(stored=True, analyzer=accent_folder)

        logger.debug(
            "Adding field to schema:\n"
            "  Model: %s\n"
            '  Field: "%s" %s',
            model_class._object_type(),
            field_name,
            field_def,
        )
        schema.add(field_name, field_def)
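For context, a hedged sketch of the column-level convention this method consumes; "Contact" and its columns are invented, and "Model" stands for the project's declarative base (assumed here):

import sqlalchemy as sa


class Contact(Model):
    __tablename__ = "contact"

    id = sa.Column(sa.Integer, primary_key=True)

    # info["searchable"] opts the column into indexing; info["index_to"]
    # routes the value to one or more Whoosh fields (it defaults to the
    # column name when omitted).
    name = sa.Column(sa.Unicode(200),
                     info={"searchable": True, "index_to": ("name", "text")})
    notes = sa.Column(sa.UnicodeText, info={"searchable": True})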
class WhooshStore(SAMLStoreBase):

    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage, FileStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in ix.searcher().search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        for a, v in list(info.items()):
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]
            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)
        for a in list(info.keys()):
            if a not in self.schema.names():
                del info[a]
        for a, v in list(info.items()):
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))  # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0
        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1
        return ne

    def collections(self):
        return b2u(self._collections)

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(list(self.objects.keys()))
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return b2u(list(self.objects.values()))
            else:
                return b2u(list(self.infos.values()))

        from whoosh.qparser import QueryParser
        # import pdb; pdb.set_trace()
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])
        return b2u(list(lst))
def __init__(self, index_dir, backend, user_name=None, acl_support=False, **kw):
    """
    Store params, create schemas.
    """
    self.index_dir = index_dir
    self.index_dir_tmp = index_dir + '.temp'
    self.backend = backend
    self.user_name = user_name  # TODO use currently logged-in username
    self.acl_support = acl_support
    self.wikiname = u''  # TODO take from app.cfg.interwikiname
    self.ix = {}  # open indexes
    self.schemas = {}  # existing schemas

    common_fields = {
        # wikiname so we can have a shared index in a wiki farm, always check this!
        WIKINAME: ID(stored=True),
        # tokenized NAME from metadata - use this for manual searching from UI
        # TODO was: NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
        NAME: ID(stored=True, field_boost=2.0),
        # unmodified NAME from metadata - use this for precise lookup by the code.
        # also needed for wildcard search, so the original string as well as the query
        # (with the wildcard) is not cut into pieces.
        NAME_EXACT: ID(field_boost=3.0),
        # revision id (aka meta id)
        REVID: ID(unique=True, stored=True),
        # MTIME from revision metadata (converted to UTC datetime)
        MTIME: DATETIME(stored=True),
        # tokenized CONTENTTYPE from metadata
        # TODO was: CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
        CONTENTTYPE: ID(stored=True),
        # unmodified list of TAGS from metadata
        TAGS: ID(stored=True),
        LANGUAGE: ID(stored=True),
        # USERID from metadata TODO: -> user ITEMID
        USERID: ID(stored=True),
        # ADDRESS from metadata
        ADDRESS: ID(stored=True),
        # HOSTNAME from metadata
        HOSTNAME: ID(stored=True),
        # SIZE from metadata
        SIZE: NUMERIC(stored=True),
        # ACTION from metadata
        ACTION: ID(stored=True),
        # tokenized COMMENT from metadata
        COMMENT: TEXT(stored=True),
        # data (content), converted to text/plain and tokenized
        CONTENT: TEXT(stored=True),
    }
    latest_revs_fields = {
        # ITEMID from metadata - as there is only latest rev of same item here, it is unique
        ITEMID: ID(unique=True, stored=True),
        # unmodified list of ITEMLINKS from metadata
        ITEMLINKS: ID(stored=True),
        # unmodified list of ITEMTRANSCLUSIONS from metadata
        ITEMTRANSCLUSIONS: ID(stored=True),
        # tokenized ACL from metadata
        # TODO was: ACL: TEXT(analyzer=AclTokenizer(self._cfg), multitoken_query="and", stored=True),
        ACL: ID(stored=True),
    }
    latest_revs_fields.update(**common_fields)

    userprofile_fields = {
        EMAIL: ID(unique=True, stored=True),
        OPENID: ID(unique=True, stored=True),
    }
    latest_revs_fields.update(**userprofile_fields)

    all_revs_fields = {
        ITEMID: ID(stored=True),
    }
    all_revs_fields.update(**common_fields)

    latest_revisions_schema = Schema(**latest_revs_fields)
    all_revisions_schema = Schema(**all_revs_fields)

    # Define dynamic fields
    dynamic_fields = [("*_id", ID(stored=True)),
                      ("*_text", TEXT(stored=True)),
                      ("*_keyword", KEYWORD(stored=True)),
                      ("*_numeric", NUMERIC(stored=True)),
                      ("*_datetime", DATETIME(stored=True)),
                      ("*_boolean", BOOLEAN(stored=True)),
                      ]

    # Adding dynamic fields to schemas
    for glob, field_type in dynamic_fields:
        latest_revisions_schema.add(glob, field_type, glob=True)
        all_revisions_schema.add(glob, field_type, glob=True)

    # schemas are needed by query parser and for index creation
    self.schemas[ALL_REVS] = all_revisions_schema
    self.schemas[LATEST_REVS] = latest_revisions_schema
class IndexConfig:

    def __init__(self, config_dict):
        self.__index_config_dict = config_dict

        self.__schema = Schema()
        try:
            for field_name in self.__index_config_dict['schema'].keys():
                field_type = self.__get_field_type(
                    self.__index_config_dict['schema'][field_name]['field_type'])
                for arg in self.__index_config_dict['schema'][field_name]['args'].keys():
                    setattr(field_type, arg,
                            self.__index_config_dict['schema'][field_name]['args'][arg])
                self.__schema.add(field_name, field_type, glob=False)
            if not self.__validate():
                raise ValueError('invalid schema')
        except Exception as ex:
            raise ex

    def __get_filter(self, name):
        class_name = self.__index_config_dict['filters'][name]['class']
        class_args = {}
        if 'args' in self.__index_config_dict['filters'][name]:
            class_args = deepcopy(self.__index_config_dict['filters'][name]['args'])
        instance = get_instance(class_name, **class_args)
        return instance

    def __get_tokenizer(self, name):
        class_name = self.__index_config_dict['tokenizers'][name]['class']
        class_args = {}
        if 'args' in self.__index_config_dict['tokenizers'][name]:
            class_args = deepcopy(self.__index_config_dict['tokenizers'][name]['args'])
        instance = get_instance(class_name, **class_args)
        return instance

    def __get_analyzer(self, name):
        instance = None
        if 'class' in self.__index_config_dict['analyzers'][name]:
            class_name = self.__index_config_dict['analyzers'][name]['class']
            class_args = {}
            if 'args' in self.__index_config_dict['analyzers'][name]:
                class_args = deepcopy(self.__index_config_dict['analyzers'][name]['args'])
            instance = get_instance(class_name, **class_args)
        elif 'tokenizer' in self.__index_config_dict['analyzers'][name]:
            instance = self.__get_tokenizer(
                self.__index_config_dict['analyzers'][name]['tokenizer'])
            if 'filters' in self.__index_config_dict['analyzers'][name]:
                for filter_name in self.__index_config_dict['analyzers'][name]['filters']:
                    instance = instance | self.__get_filter(filter_name)
        return instance

    def __get_field_type(self, name):
        class_name = self.__index_config_dict['field_types'][name]['class']
        class_args = {}
        if 'args' in self.__index_config_dict['field_types'][name]:
            class_args = deepcopy(self.__index_config_dict['field_types'][name]['args'])
        if 'analyzer' in class_args:
            class_args['analyzer'] = self.__get_analyzer(
                class_args['analyzer']) if class_args['analyzer'] else None
        if 'tokenizer' in class_args:
            class_args['tokenizer'] = self.__get_tokenizer(
                class_args['tokenizer']) if class_args['tokenizer'] else None
        instance = get_instance(class_name, **class_args)
        return instance

    def __get_unique_fields(self):
        return [name for name, field in self.__schema.items() if field.unique]

    def __validate(self):
        valid = False
        if len(self.__get_unique_fields()) == 1:
            valid = True
        return valid

    def get_schema(self):
        return self.__schema

    def get_doc_id_field(self):
        return self.__get_unique_fields()[0]

    def get_storage_type(self):
        try:
            storage_type = self.__index_config_dict['storage']['type']
        except KeyError:
            storage_type = 'file'
        return storage_type

    def get_writer_processors(self):
        try:
            procs = self.__index_config_dict['writer']['processors']
        except KeyError:
            procs = 1
        return procs

    def get_writer_batch_size(self):
        try:
            batch_size = self.__index_config_dict['writer']['batch_size']
        except KeyError:
            batch_size = 100
        return batch_size

    def get_writer_multi_segment(self):
        try:
            multi_segment = self.__index_config_dict['writer']['multi_segment']
        except KeyError:
            multi_segment = False
        return multi_segment

    def get_writer_auto_commit_period(self):
        try:
            period = self.__index_config_dict['writer']['auto_commit']['period']
        except KeyError:
            period = 0
        return period

    def get_writer_auto_commit_limit(self):
        try:
            limit = self.__index_config_dict['writer']['auto_commit']['limit']
        except KeyError:
            limit = 10
        return limit
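A hedged sketch of a config dict IndexConfig appears to accept, inferred from the keys it reads ('schema', 'field_types', 'analyzers', 'storage', 'writer'). The field names are invented, and it is assumed here that get_instance() resolves dotted class paths and passes the args as keyword arguments.

example_config = {
    'schema': {
        'id': {'field_type': 'id', 'args': {}},
        'title': {'field_type': 'text', 'args': {'stored': True}},
    },
    'field_types': {
        'id': {'class': 'whoosh.fields.ID',
               'args': {'unique': True, 'stored': True}},
        'text': {'class': 'whoosh.fields.TEXT',
                 'args': {'analyzer': 'standard'}},
    },
    'analyzers': {
        'standard': {'class': 'whoosh.analysis.StandardAnalyzer'},
    },
    'storage': {'type': 'file'},
    'writer': {'processors': 2,
               'batch_size': 500,
               'auto_commit': {'period': 60, 'limit': 10}},
}

index_config = IndexConfig(example_config)
print(index_config.get_doc_id_field())       # -> 'id' (the single unique field)
print(index_config.get_writer_batch_size())  # -> 500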