def test_build_attrs(self):
    """SAAdapter must flag non-indexable models and, for indexable ones,
    build matching doc_attrs callables and schema fields."""
    schema = Schema()
    adapter = SAAdapter(SANotIndexable, schema)
    self.assertEqual(adapter.indexable, False)
    self.assertEqual(adapter.doc_attrs, {})

    adapter = SAAdapter(Entity, schema)
    self.assertEqual(adapter.indexable, False)

    adapter = SAAdapter(SubclassEntityIndexable, schema)
    self.assertEqual(adapter.indexable, True)
    # doc_attrs and the schema expose the same field set
    expected = set(('object_key', 'id', 'name', 'object_type', 'text',
                    'created_at', 'updated_at', 'name_prefix', 'owner',
                    'owner_name', 'creator_name', 'creator'))
    self.assertEqual(set(adapter.doc_attrs), expected)
    # BUG FIX: the original asserted all(lambda f: ... for f in ...), which
    # only tested the truthiness of the lambda objects themselves (always
    # True). Call callable(f) on each value instead.
    self.assertTrue(all(callable(f) for f in adapter.doc_attrs.itervalues()))
    self.assertEqual(set(schema.names()), expected)

    # A pre-seeded schema keeps its explicit `id` field.
    schema = Schema(
        id=NUMERIC(numtype=int, bits=64, signed=False, stored=True, unique=True),
    )
    adapter = SAAdapter(Indexable, schema)
    self.assertEqual(adapter.indexable, True)
    self.assertEqual(set(adapter.doc_attrs), set(('id', 'text', 'num', 'name')))
    self.assertTrue(all(callable(f) for f in adapter.doc_attrs.itervalues()))
    self.assertEqual(set(schema.names()), set(('id', 'text', 'num', 'name')))
    self.assertTrue(isinstance(schema['text'], TEXT))
    self.assertTrue(isinstance(schema['num'], NUMERIC))
def test_build_attrs():
    """SAAdapter must flag indexability and build matching doc_attrs/schema."""
    schema = Schema()
    adapter = SAAdapter(SANotIndexable, schema)
    assert not adapter.indexable
    assert adapter.doc_attrs == {}

    adapter = SAAdapter(Entity, schema)
    assert not adapter.indexable

    adapter = SAAdapter(SubclassEntityIndexable, schema)
    assert adapter.indexable
    # doc_attrs and the schema expose the same field set
    expected = {
        "object_key", "id", "name", "slug", "object_type", "text",
        "created_at", "updated_at", "name_prefix", "owner", "owner_name",
        "creator_name", "creator", "allowed_roles_and_users", "tag_ids",
        "tag_text",
    }
    assert set(adapter.doc_attrs) == expected
    # BUG FIX: all(lambda f: ... for f in ...) only tested the truthiness of
    # the lambda objects (always True); call callable(f) on each value.
    assert all(callable(f) for f in six.itervalues(adapter.doc_attrs))
    assert set(schema.names()) == expected

    # A pre-seeded schema keeps its explicit `id` field.
    schema = Schema(id=NUMERIC(bits=64, signed=False, stored=True, unique=True))
    adapter = SAAdapter(Indexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {"id", "text", "num", "name"}
    assert all(callable(f) for f in six.itervalues(adapter.doc_attrs))
    assert set(schema.names()) == {"id", "text", "num", "name"}
    assert isinstance(schema["text"], TEXT)
    assert isinstance(schema["num"], NUMERIC)
def test_build_attrs_4() -> None:
    """An explicit `id` field must be kept while adapter fields are added."""
    schema = Schema(
        id=NUMERIC(bits=64, signed=False, stored=True, unique=True))
    adapter = SAAdapter(Indexable, schema)
    assert adapter.indexable
    # doc_attrs and the schema expose the same field set
    expected = {"id", "text", "num", "name", "object_type", "object_key"}
    assert set(adapter.doc_attrs) == expected
    # BUG FIX: all(lambda f: ... for f in ...) only tested the truthiness of
    # the lambda objects (always True); call callable(f) on each value.
    assert all(callable(f) for f in adapter.doc_attrs.values())
    assert set(schema.names()) == expected
    assert isinstance(schema["text"], TEXT)
    assert isinstance(schema["num"], NUMERIC)
def test_build_attrs(self):
    """SAAdapter must flag indexability and build matching doc_attrs/schema."""
    schema = Schema()
    adapter = SAAdapter(SANotIndexable, schema)
    assert not adapter.indexable
    assert adapter.doc_attrs == {}

    adapter = SAAdapter(Entity, schema)
    assert not adapter.indexable

    adapter = SAAdapter(SubclassEntityIndexable, schema)
    assert adapter.indexable
    # doc_attrs and the schema expose the same field set
    expected = {
        'object_key', 'id', 'name', 'slug', 'object_type', 'text',
        'created_at', 'updated_at', 'name_prefix', 'owner', 'owner_name',
        'creator_name', 'creator', 'allowed_roles_and_users', 'tag_ids',
        'tag_text',
    }
    assert set(adapter.doc_attrs) == expected
    # BUG FIX: all(lambda f: ... for f in ...) only tested the truthiness of
    # the lambda objects (always True); call callable(f) on each value.
    assert all(callable(f) for f in adapter.doc_attrs.itervalues())
    assert set(schema.names()) == expected

    # A pre-seeded schema keeps its explicit `id` field.
    schema = Schema(
        id=NUMERIC(numtype=int, bits=64, signed=False, stored=True, unique=True),
    )
    adapter = SAAdapter(Indexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {'id', 'text', 'num', 'name'}
    assert all(callable(f) for f in adapter.doc_attrs.itervalues())
    assert set(schema.names()) == {'id', 'text', 'num', 'name'}
    assert isinstance(schema['text'], TEXT)
    assert isinstance(schema['num'], NUMERIC)
def __init__(self, model: t.Type[Model], schema: Schema, index_name: t.Optional[str] = None):
    """Build a search adapter for *model* using *schema*.

    :param model: django.db.models.Model subclass whose instances are to be searched
    :param schema: field schema for the search index
    :param index_name: name of the search index
    """
    # NOTE(review): the original docstring documented a ``pk_name``
    # parameter that does not exist; the primary-key field name is derived
    # from the model's _meta below instead.
    self.model = model
    self.schema = schema
    # Register the model's primary key so every indexed document is addressable.
    self.pk_name = model._meta.pk.name
    self.schema.add(self.pk_name, PK_FIELDTYPE)
    self.index = self._init_index(index_name)
    # The pk field identifies documents; it is excluded from query parsing.
    query_fields = set(schema.names()) - {self.pk_name}
    self.query_parser = LectorQueryParser(query_fields, self.schema)
    # Cache up to 64 search results, each expiring after 60 seconds.
    self._search_cache: t.MutableMapping[str, Results] = cachetools.TTLCache(64, 60.0)
def test_build_attrs_3() -> None:
    """SubclassEntityIndexable must expose the full indexable attribute set."""
    schema = Schema()
    adapter = SAAdapter(SubclassEntityIndexable, schema)
    assert adapter.indexable
    # doc_attrs and the schema expose the same field set
    expected = {
        "allowed_roles_and_users", "created_at", "creator", "creator_name",
        "id", "name", "name_prefix", "object_key", "object_type", "owner",
        "owner_name", "slug", "tag_ids", "tag_text", "text", "updated_at",
    }
    assert set(adapter.doc_attrs) == expected
    # BUG FIX: all(lambda f: ... for f in ...) only tested the truthiness of
    # the lambda objects (always True); call callable(f) on each value.
    assert all(callable(f) for f in adapter.doc_attrs.values())
    assert set(schema.names()) == expected
def test_build_attrs_3():
    """SubclassEntityIndexable must expose the full indexable attribute set."""
    schema = Schema()
    adapter = SAAdapter(SubclassEntityIndexable, schema)
    assert adapter.indexable
    # doc_attrs and the schema expose the same field set
    expected = {
        "allowed_roles_and_users", "created_at", "creator", "creator_name",
        "id", "name", "name_prefix", "object_key", "object_type", "owner",
        "owner_name", "slug", "tag_ids", "tag_text", "text", "updated_at",
    }
    assert set(adapter.doc_attrs) == expected
    # BUG FIX: all(lambda f: ... for f in ...) only tested the truthiness of
    # the lambda objects (always True); call callable(f) on each value.
    assert all(callable(f) for f in adapter.doc_attrs.values())
    assert set(schema.names()) == expected
def test_build_attrs_4():
    """An explicit `id` field must be kept while adapter fields are added."""
    schema = Schema(id=NUMERIC(bits=64, signed=False, stored=True, unique=True))
    adapter = SAAdapter(Indexable, schema)
    assert adapter.indexable
    # doc_attrs and the schema expose the same field set
    expected = {"id", "text", "num", "name", "object_type", "object_key"}
    assert set(adapter.doc_attrs) == expected
    # BUG FIX: all(lambda f: ... for f in ...) only tested the truthiness of
    # the lambda objects (always True); call callable(f) on each value.
    assert all(callable(f) for f in adapter.doc_attrs.values())
    assert set(schema.names()) == expected
    assert isinstance(schema["text"], TEXT)
    assert isinstance(schema["num"], NUMERIC)
from whoosh import fields

# Print the supported field types (whoosh.fields exports them as UPPERCASE names).
# BUG FIX: the original sliced dir(fields)[:10] *before* filtering for
# uppercase names, so the result depended on how unrelated names sorted in
# dir(); filter first, then take the first ten.
print([item for item in dir(fields) if item.isupper()][:10])
# e.g. ['BOOLEAN', 'COLUMN', 'DATETIME', 'ID', 'IDLIST', 'KEYWORD', ...]

# Inspect the schema built earlier in this script (assumes `schema` is in scope).
print(len(schema.items()))  # 3
print(schema.items()[0])  # ('content', TEXT(format=Positions(boost=1.0), scorable=True, stored=False, unique=None))
print(schema.items()[1])  # ('path', ID(format=Existence(boost=1.0), scorable=None, stored=True, unique=False))
print(schema.items()[2])  # ('title', TEXT(format=Positions(boost=1.0), scorable=True, stored=True, unique=None))
print(schema.names())  # ['content', 'path', 'title']; names of the fields in this schema
print(schema.scorable_names())  # ['content', 'title']; fields that store field lengths
print(schema.stored_names())  # ['path', 'title']; fields whose values are stored
print(schema.has_scorable_fields())  # True

##################################################################
## 2. Index creation
## create_in(dirname, schema, indexname=None)
## Convenience function to create an index in a directory. Takes care of
## creating a FileStorage object for you.
ix = create_in('./tmp', schema)  # persists the schema under ./tmp/
# NOTE: run create_in only once per directory, otherwise a LockError is raised.
print(type(ix))  # <class 'whoosh.index.FileIndex'>
print(ix.schema)  # <Schema: ['content', 'path', 'title']>
class WhooshStore(SAMLStoreBase):
    """SAML metadata store backed by an in-memory (RamStorage) Whoosh index.

    Raw entity elements live in ``self.objects`` and their extracted info
    dicts in ``self.infos``; the Whoosh index only maps attribute keywords
    to ``object_id`` keys.
    """

    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        # One KEYWORD field per known entity attribute.
        for a in ATTRS.keys():
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        """Debug helper: print the schema and every indexed document."""
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            # BUG FIX: the original called ix.searcher() again inside the
            # `with` block, opening a second searcher that was never closed.
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        """Normalize an info dict in place so it matches the index schema:
        flatten entity_attributes, listify scalars, map URIs to field names,
        drop unknown fields and coerce values to text."""
        if 'entity_attributes' in info:
            for a, v in info.pop('entity_attributes').items():
                info[a] = v
        # BUG FIX: iterate over snapshots (list(...)) — the original mutated
        # `info` while iterating its live views, which raises RuntimeError
        # on Python 3.
        for a, v in list(info.items()):
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]
            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)
        for a in list(info.keys()):
            if a not in self.schema.names():
                del info[a]
        for a, v in list(info.items()):
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        """Index a single entity element, optionally tagging its collection."""
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))  # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        """Index every EntityDescriptor found in tree `t`; return the count."""
        relt = root(t)
        assert (relt is not None)
        ne = 0
        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1
        return ne

    def collections(self):
        return self._collections

    def reset(self):
        # Re-running __init__ discards all objects, infos and the RAM index.
        self.__init__()

    def size(self, a=None, v=None):
        """Entity count overall, per attribute `a`, or per value pair `a=v`."""
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield ATTRS[n]

    def attributes(self):
        return list(self._attributes())

    def attribute(self, a):
        """All indexed values of attribute `a` (empty list if unknown)."""
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return list(searcher.lexicon(n))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        """Look up entities by a '+'-joined attribute=value query string.

        Returns raw elements when `raw` is true, info dicts otherwise.
        """
        if key == 'entities' or key is None:
            if raw:
                return self.objects.values()
            else:
                return self.infos.values()
        from whoosh.qparser import QueryParser
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in ATTRS_INV.items():
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        # Rewrite attr=value and {uri}value forms into whoosh field queries.
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()
        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])
        return list(lst)
class RedisWhooshStore(SAMLStoreBase):  # TODO: This needs a gc mechanism for keys (uuids)
    """SAML metadata store with redis-backed object/part dicts and an
    on-disk Whoosh index for full-text and attribute search."""

    def json_dict(self, name):
        # LRU-cached JSON dict persisted in redis under "<store-name>_<name>".
        return LRUProxyDict(JSONDict(key='{}_{}'.format(self._name, name),
                                     redis=self._redis,
                                     writeback=True),
                            maxsize=config.cache_size)

    def xml_dict(self, name):
        # LRU-cached XML dict persisted in redis under "<store-name>_<name>".
        return LRUProxyDict(XMLDict(key='{}_{}'.format(self._name, name),
                                    redis=self._redis,
                                    writeback=True),
                            maxsize=config.cache_size)

    def __init__(self, *args, **kwargs):
        # directory/clear/name/redis are consumed from kwargs; remaining
        # args are ignored.
        self._dir = kwargs.pop('directory', '.whoosh')
        clear = bool(kwargs.pop('clear', config.store_clear))
        self._name = kwargs.pop('name', config.store_name)
        self._redis = kwargs.pop('redis', redis())
        if clear:
            # wipe the on-disk whoosh directory before setup
            shutil.rmtree(self._dir)
        now = datetime.now()
        self._last_index_time = now
        self._last_modified = now
        self._setup()
        if clear:
            # also drop the redis-backed dicts
            self.reset()

    def _setup(self):
        """(Re)create schema, redis dicts, storage and index; also used to
        rebuild state after unpickling."""
        self._redis = getattr(self, '_redis', None)
        if not self._redis:
            self._redis = redis()  # XXX test cases won't get correctly unpicked because of this
        self.schema = Schema(content=NGRAMWORDS(stored=False))
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        self.schema.add('sha1', ID(stored=True, unique=True))
        # one KEYWORD field per known entity attribute
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self.objects = self.xml_dict('objects')
        self.parts = self.json_dict('parts')
        self.storage = FileStorage(os.path.join(self._dir, self._name))
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except BaseException as ex:
            # opening failed (e.g. first run) -- create a fresh index
            log.warn(ex)
            self.storage.create()
            self.index = self.storage.create_index(self.schema)
        self._reindex()

    def __getstate__(self):
        # Only pickle plain attributes; redis/whoosh handles are rebuilt in
        # _setup() on unpickle.
        state = dict()
        for p in ('_dir', '_name', '_last_index_time', '_last_modified'):
            state[p] = getattr(self, p)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._setup()

    def __call__(self, *args, **kwargs):
        # Hook invoked by the pipeline: register with the watcher/scheduler
        # and schedule periodic reindexing.
        watched = kwargs.pop('watched', None)
        scheduler = kwargs.pop('scheduler', None)
        if watched is not None and scheduler is not None:
            super(RedisWhooshStore, self).__call__(watched=watched, scheduler=scheduler)
            log.debug("indexing using {}".format(scheduler))
            # NOTE(review): this inner check is redundant here; the comment
            # suggests a modification-time condition was edited out.
            if scheduler is not None:  # and self._last_modified > self._last_index_time and :
                scheduler.add_job(RedisWhooshStore._reindex,
                                  args=[self],
                                  max_instances=1,
                                  coalesce=True,
                                  misfire_grace_time=2 * config.update_frequency)

    def _reindex(self):
        """Drop refs no longer referenced by any part, then rebuild the
        whoosh index from scratch under an index lock."""
        log.debug("indexing the store...")
        self._last_index_time = datetime.now()
        seen = set()
        refs = set([b2u(s) for s in self.objects.keys()])
        parts = self.parts.values()
        # mark every ref still listed in some part's items
        for ref in refs:
            for part in parts:
                if ref in part['items']:
                    seen.add(ref)
        ix = self.storage.open_index()
        lock = ix.lock("reindex")
        try:
            log.debug("waiting for index lock")
            lock.acquire(True)
            log.debug("got index lock")
            with ix.writer() as writer:
                for ref in refs:
                    if ref not in seen:
                        log.debug("removing unseen ref {}".format(ref))
                        del self.objects[ref]
                        del self.parts[ref]
                log.debug("updating index")
                for e in self.objects.values():
                    info = self._index_prep(entity_simple_info(e))
                    ref = object_id(e)
                    writer.add_document(object_id=ref, **info)
                    # CLEAR mergetype: commit replaces the whole index
                    writer.mergetype = CLEAR
        finally:
            try:
                log.debug("releasing index lock")
                lock.release()
            except ThreadError as ex:
                # lock already released -- best effort
                pass

    def dump(self):
        """Debug helper: print every indexed document."""
        ix = self.storage.open_index()
        from whoosh.query import Every
        with ix.searcher() as searcher:
            # NOTE(review): ix.searcher() is called again here, opening a
            # second searcher that is never closed -- confirm intent.
            for result in ix.searcher().search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        """Build the document dict for the index from an entity info dict:
        a combined `content` text field, known attribute fields, and sha1."""
        res = dict()
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        content = " ".join(
            filter(lambda x: x is not None, [
                info.get(x, '')
                for x in ('service_name', 'title', 'domain', 'keywords', 'scopes')
            ]))
        res['content'] = content.strip()
        for a, v in info.items():
            k = a
            if a in ATTRS_INV:
                k = ATTRS_INV[a]
            if k in self.schema.names():
                if type(v) in (list, tuple):
                    res[k] = " ".join([vv.lower() for vv in v])
                elif type(v) in six.string_types:
                    res[k] = info[a].lower()
        res['sha1'] = hash_id(info['entity_id'], prefix=False)
        return res

    def update(self, t, tid=None, etag=None, lazy=True):
        """Store the entities of tree `t`; reindex immediately unless lazy.

        A part (single entity or collection) is only rewritten when its
        etag differs from the stored one.
        """
        relt = root(t)
        assert (relt is not None)
        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            ref = object_id(relt)
            parts = None
            if ref in self.parts:
                parts = self.parts[ref]
            if etag is not None and (parts is None or parts.get('etag', None) != etag):
                self.parts[ref] = {
                    'id': relt.get('entityID'),
                    'etag': etag,
                    'count': 1,
                    'items': [ref]
                }
                self.objects[ref] = relt
                self._last_modified = datetime.now()
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            if etag is None:
                # derive an etag from the serialized tree
                etag = hex_digest(dumptree(t, pretty_print=False), 'sha256')
            parts = None
            if tid in self.parts:
                parts = self.parts[tid]
            if parts is None or parts.get('etag', None) != etag:
                items = set()
                for e in iter_entities(t):
                    ref = object_id(e)
                    items.add(ref)
                    self.objects[ref] = e
                self.parts[tid] = {
                    'id': tid,
                    'count': len(items),
                    'etag': etag,
                    'items': list(items)
                }
                self._last_modified = datetime.now()
        if not lazy:
            self._reindex()

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def collections(self):
        return [b2u(ref) for ref in self.parts.keys()]

    def reset(self):
        """Delete the redis-backed 'parts' and 'objects' keys."""
        # NOTE(review): the loop variable `k` is unused and both keys are
        # deleted on each iteration -- the deletes are redundant but
        # harmless; confirm the intended form was `self._redis.delete(k)`.
        for k in ('{}_{}'.format(self._name, 'parts'),
                  '{}_{}'.format(self._name, 'objects')):
            self._redis.delete('{}_{}'.format(self._name, 'parts'))
            self._redis.delete('{}_{}'.format(self._name, 'objects'))

    def size(self, a=None, v=None):
        """Entity count overall, per attribute `a`, or per value pair `a=v`."""
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        """All indexed values of attribute `a` (empty list if unknown)."""
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def _prep_key(self, key):
        """Rewrite a '+'-joined attr=value query into whoosh query syntax."""
        # import pdb; pdb.set_trace()
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        key = key.replace('-', ' AND NOT ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()
        return key

    def _entities(self):
        # All entity elements referenced by any part.
        lst = set()
        for ref_data in self.parts.values():
            for ref in ref_data['items']:
                e = self.objects.get(ref, None)
                if e is not None:
                    lst.add(e)
        return b2u(list(lst))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def lookup(self, key):
        """Resolve `key` as 'entities', a direct ref, a part id, or a query."""
        if key == 'entities' or key is None:
            return self._entities()
        bkey = six.b(key)
        if bkey in self.objects:
            return [self.objects.get(bkey)]
        if bkey in self.parts:
            # a collection: recursively resolve each member ref
            res = []
            part = self.parts.get(bkey)
            for item in part['items']:
                res.extend(self.lookup(item))
            return res
        key = self._prep_key(key)
        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                e = self.objects.get(result['object_id'], None)
                if e is not None:
                    lst.add(e)
        return b2u(list(lst))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def search(self, query=None, path=None, entity_filter=None, related=None):
        """Full-text search over content/domain; returns discojson dicts."""
        if entity_filter:
            query = "{!s} AND {!s}".format(query, entity_filter)
        query = self._prep_key(query)
        qp = MultifieldParser(['content', 'domain'], schema=self.schema)
        q = qp.parse(query)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            log.debug(results)
            for result in results:
                lst.add(result['object_id'])
        res = list()
        for ref in lst:
            e = self.objects.get(ref, None)
            if e is not None:
                res.append(discojson(e))
        return res
def test_build_attrs(self):
    """SAAdapter must flag indexability and build matching doc_attrs/schema."""
    schema = Schema()
    adapter = SAAdapter(SANotIndexable, schema)
    assert not adapter.indexable
    assert adapter.doc_attrs == {}

    adapter = SAAdapter(Entity, schema)
    assert not adapter.indexable

    adapter = SAAdapter(SubclassEntityIndexable, schema)
    assert adapter.indexable
    # doc_attrs and the schema expose the same field set
    expected = {
        'object_key', 'id', 'name', 'slug', 'object_type', 'text',
        'created_at', 'updated_at', 'name_prefix', 'owner', 'owner_name',
        'creator_name', 'creator', 'allowed_roles_and_users', 'tag_ids',
        'tag_text',
    }
    assert set(adapter.doc_attrs) == expected
    # BUG FIX: all(lambda f: ... for f in ...) only tested the truthiness of
    # the lambda objects (always True); call callable(f) on each value.
    assert all(callable(f) for f in six.itervalues(adapter.doc_attrs))
    assert set(schema.names()) == expected

    # A pre-seeded schema keeps its explicit `id` field.
    schema = Schema(
        id=NUMERIC(numtype=int, bits=64, signed=False, stored=True, unique=True),
    )
    adapter = SAAdapter(Indexable, schema)
    assert adapter.indexable
    assert set(adapter.doc_attrs) == {'id', 'text', 'num', 'name'}
    assert all(callable(f) for f in six.itervalues(adapter.doc_attrs))
    assert set(schema.names()) == {'id', 'text', 'num', 'name'}
    assert isinstance(schema['text'], TEXT)
    assert isinstance(schema['num'], NUMERIC)
class WhooshStore(SAMLStoreBase):
    """SAML metadata store backed by an in-memory (RamStorage) Whoosh index.

    Raw entity elements live in ``self.objects`` and their extracted info
    dicts in ``self.infos``; the Whoosh index only maps attribute keywords
    to ``object_id`` keys.
    """

    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        # One KEYWORD field per known entity attribute.
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        """Debug helper: print the schema and every indexed document."""
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            # BUG FIX: the original called ix.searcher() again inside the
            # `with` block, opening a second searcher that was never closed.
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        """Normalize an info dict in place so it matches the index schema:
        flatten entity_attributes, listify scalars, map URIs to field names,
        drop unknown fields and coerce values to text."""
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        # Iterate over snapshots: `info` is mutated inside the loops.
        for a, v in list(info.items()):
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]
            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)
        for a in list(info.keys()):
            if a not in self.schema.names():
                del info[a]
        for a, v in list(info.items()):
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        """Index a single entity element, optionally tagging its collection."""
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))  # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        """Index every EntityDescriptor found in tree `t`; return the count."""
        relt = root(t)
        assert (relt is not None)
        ne = 0
        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1
        return ne

    def collections(self):
        return b2u(self._collections)

    def reset(self):
        # Re-running __init__ discards all objects, infos and the RAM index.
        self.__init__()

    def size(self, a=None, v=None):
        """Entity count overall, per attribute `a`, or per value pair `a=v`."""
        if a is None:
            return len(list(self.objects.keys()))
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        """All indexed values of attribute `a` (empty list if unknown)."""
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        """Look up entities by a '+'-joined attribute=value query string.

        Returns raw elements when `raw` is true, info dicts otherwise.
        """
        if key == 'entities' or key is None:
            if raw:
                return b2u(list(self.objects.values()))
            else:
                return b2u(list(self.infos.values()))
        from whoosh.qparser import QueryParser
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        # Rewrite attr=value and {uri}value forms into whoosh field queries.
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()
        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])
        return b2u(list(lst))
##################################################################
## 1. Create the schema
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
# stored=True means the field's value can be returned with search results.
# All keyword arguments to the constructor are treated as fieldname = fieldtype pairs.
# The fieldtype can be an instantiated FieldType object, or a FieldType sub-class
# (in which case the Schema will instantiate it with the default constructor
# before adding it). For example:
s = Schema(content=TEXT, title=TEXT(stored=True), tags=KEYWORD(stored=True))
# Search results usually only need the title and path; the content is read
# when the user opens the document, so `content` is not stored.

from whoosh import fields

# Print the supported field types (whoosh.fields exports them as UPPERCASE names).
# BUG FIX: the original sliced dir(fields)[:10] *before* filtering for
# uppercase names, so the result depended on how unrelated names sorted in
# dir(); filter first, then take the first ten.
print([item for item in dir(fields) if item.isupper()][:10])
# e.g. ['BOOLEAN', 'COLUMN', 'DATETIME', 'ID', 'IDLIST', 'KEYWORD', ...]

print(len(schema.items()))  # 3
print(schema.items()[0])  # ('content', TEXT(format=Positions(boost=1.0), scorable=True, stored=False, unique=None))
print(schema.items()[1])  # ('path', ID(format=Existence(boost=1.0), scorable=None, stored=True, unique=False))
print(schema.items()[2])  # ('title', TEXT(format=Positions(boost=1.0), scorable=True, stored=True, unique=None))
print(schema.names())  # ['content', 'path', 'title']; names of the fields in this schema
print(schema.scorable_names())  # ['content', 'title']; fields that store field lengths
print(schema.stored_names())  # ['path', 'title']; fields whose values are stored
print(schema.has_scorable_fields())  # True

##################################################################
## 2. Index creation
## create_in(dirname, schema, indexname=None)
## Convenience function to create an index in a directory. Takes care of
## creating a FileStorage object for you.
ix = create_in('./tmp', schema)  # persists the schema under ./tmp/
# NOTE: run create_in only once per directory, otherwise a LockError is raised.
print(type(ix))  # <class 'whoosh.index.FileIndex'>
print(ix.schema)  # <Schema: ['content', 'path', 'title']>

## writer(procs=1, **kwargs): Returns an IndexWriter object for this index.
writer = ix.writer()  # add the documents to be indexed, as defined by the schema
print(type(writer))  # <class 'whoosh.writing.SegmentWriter'>
print(writer.schema)  # <Schema: ['content', 'path', 'title']>
## add_document(**fields): The keyword arguments map field names to the values to index/store
class Indexer:
    """Thin wrapper around a Whoosh index stored under
    search_engine/indexdir_<source>, with add/search/list helpers."""

    # Recognized document sources; anything other than 'file' maps to 'internet'.
    SOURCES = ['file', 'internet']

    def __init__(self, source='file'):
        # Index directory lives two levels above this file, under search_engine/.
        CURRENT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/search_engine"
        self.source = source if source == 'file' else 'internet'
        self.index_dir = "%s/%s_%s" % (CURRENT_DIR, 'indexdir', self.source)
        # title is boosted and sortable; title/body use stemming and keep
        # spelling data for suggestions.
        self.schema = Schema(title=TEXT(phrase=True,
                                        sortable=True,
                                        stored=True,
                                        field_boost=2.0,
                                        spelling=True,
                                        analyzer=StemmingAnalyzer()),
                             url=ID(stored=True),
                             body=TEXT(spelling=True,
                                       stored=True,
                                       analyzer=StemmingAnalyzer()))
        self.writer = None
        self.ix = None
        self.create_or_open_index()

    def add_document(self, doc, commit=True):
        """Add one doc (dict with 'title', 'url', 'body'); commit by default."""
        writer = self.ix.writer()
        writer.add_document(title=doc['title'], url=doc['url'], body=doc['body'])
        if commit:
            writer.commit()

    def commit(self):
        # NOTE(review): this opens a *new* writer and commits it immediately;
        # documents added via add_document(commit=False) used their own
        # writer, so this commit does not flush them -- confirm intent.
        writer = self.ix.writer()
        writer.commit()

    def get_doc(self, url):
        """Return search results for an exact url lookup."""
        query = MultifieldParser(["url"], self.ix.schema).parse(url)
        return self.ix.searcher().search(query)

    def get_document_count(self):
        return self.ix.searcher().doc_count_all()

    def get_field_list(self):
        return self.schema.names()

    def get_word_count(self):
        # Number of distinct indexed terms in the body field.
        return len(list(self.ix.searcher().lexicon("body")))

    def get_doc_list(self, page_number, pagelen=20):
        """Return one page of stored docs (1-based page_number) with a
        100-character body preview."""
        result = []
        # NOTE(review): this walks *all* documents and filters by position;
        # searchers opened here are never explicitly closed.
        for i, doc in enumerate(self.ix.searcher().documents()):
            if i in range((page_number - 1) * pagelen, (page_number) * pagelen):
                result.append({'index': i,
                               'title': doc['title'],
                               'url': doc['url'],
                               'body': doc['body'][:100]})
        return result

    def search(self, query_str, page_number):
        """Search body+title and return one page of hits with highlights."""
        query = MultifieldParser(["body", "title"], self.ix.schema).parse(query_str)
        docs = self.ix.searcher().search_page(query, page_number, pagelen=20)
        result = []
        for doc in docs:
            result.append({'title': doc['title'],
                           'url': doc['url'],
                           'body': doc.highlights("body")})
        return result

    def clean_index(self):
        """Recreate (empty) the index in place if it exists."""
        # self.writer = self.ix.writer()
        if index.exists_in(self.index_dir):
            self.ix = create_in(self.index_dir, self.schema)

    def create_or_open_index(self):
        """Open the existing index or create directory + index on first use."""
        if index.exists_in(self.index_dir):
            self.ix = index.open_dir(self.index_dir)
        else:
            if not os.path.exists(self.index_dir):
                os.mkdir(self.index_dir)
            self.ix = create_in(self.index_dir, self.schema)