Code example #1
def get_cache_schema():
    schema = Schema(
        key=ID(unique=True, stored=True),  # Copied from Zotero.
        version=NUMERIC(stored=True),  # Copied from Zotero.
        parentItem=ID(stored=True),  # Kerko addition.
        itemType=ID(stored=True),  # Kerko addition.
        library=STORED,  # Copied from Zotero & JSON-encoded.
        links=STORED,  # Copied from Zotero & JSON-encoded.
        meta=STORED,  # Copied from Zotero & JSON-encoded.
        data=STORED,  # Copied from Zotero & JSON-encoded.
        fulltext=STORED,  # Kerko addition.
    )
    for format_ in get_formats():
        schema.add(format_, STORED)
    return schema
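
The STORED-only fields above are never indexed for search; they just carry payloads, which is why Kerko can JSON-encode them before writing. A minimal sketch of filling this cache, assuming only whoosh itself (the RAM storage and sample values are illustrative, not Kerko's actual sync code):

import json
from whoosh.filedb.filestore import RamStorage

storage = RamStorage()  # illustrative; a real cache would use on-disk storage
cache = storage.create_index(get_cache_schema())
with cache.writer() as writer:
    writer.add_document(
        key=u"ABCD1234",                             # ID: exact-match primary key
        version=1234,                                # NUMERIC
        itemType=u"journalArticle",                  # ID
        data=json.dumps({"title": "Example item"}),  # STORED: any value, never searched
    )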
Code example #2
File: indexing.py (project: denedios/moin-2.0)
    def __init__(self, index_storage, backend, wiki_name=None, acl_rights_contents=[], **kw):
        """
        Store params, create schemas.
        """
        self.index_storage = index_storage
        self.backend = backend
        self.wikiname = wiki_name
        self.ix = {}  # open indexes
        self.schemas = {}  # existing schemas

        common_fields = {
            # wikiname so we can have a shared index in a wiki farm, always check this!
            WIKINAME: ID(stored=True),
            # namespace, so we can have different namespaces within a wiki, always check this!
            NAMESPACE: ID(stored=True),
            # tokenized NAME from metadata - use this for manual searching from UI
            NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
            # unmodified NAME from metadata - use this for precise lookup by the code.
            # also needed for wildcard search, so the original string as well as the query
            # (with the wildcard) is not cut into pieces.
            NAME_EXACT: ID(field_boost=3.0),
            # revision id (aka meta id)
            REVID: ID(unique=True, stored=True),
            # parent revision id
            PARENTID: ID(stored=True),
            # backend name (which backend is this rev stored in?)
            BACKENDNAME: ID(stored=True),
            # MTIME from revision metadata (converted to UTC datetime)
            MTIME: DATETIME(stored=True),
            # publish time from metadata (converted to UTC datetime)
            PTIME: DATETIME(stored=True),
            # ITEMTYPE from metadata, always matched exactly hence ID
            ITEMTYPE: ID(stored=True),
            # tokenized CONTENTTYPE from metadata
            CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
            # unmodified list of TAGS from metadata
            TAGS: ID(stored=True),
            LANGUAGE: ID(stored=True),
            # USERID from metadata
            USERID: ID(stored=True),
            # ADDRESS from metadata
            ADDRESS: ID(stored=True),
            # HOSTNAME from metadata
            HOSTNAME: ID(stored=True),
            # SIZE from metadata
            SIZE: NUMERIC(stored=True),
            # ACTION from metadata
            ACTION: ID(stored=True),
            # tokenized COMMENT from metadata
            COMMENT: TEXT(stored=True),
            # SUMMARY from metadata
            SUMMARY: TEXT(stored=True),
            # DATAID from metadata
            DATAID: ID(stored=True),
            # TRASH from metadata
            TRASH: BOOLEAN(stored=True),
            # data (content), converted to text/plain and tokenized
            CONTENT: TEXT(stored=True, spelling=True),
        }

        latest_revs_fields = {
            # ITEMID from metadata - as there is only latest rev of same item here, it is unique
            ITEMID: ID(unique=True, stored=True),
            # unmodified list of ITEMLINKS from metadata
            ITEMLINKS: ID(stored=True),
            # unmodified list of ITEMTRANSCLUSIONS from metadata
            ITEMTRANSCLUSIONS: ID(stored=True),
            # tokenized ACL from metadata
            ACL: TEXT(analyzer=AclTokenizer(acl_rights_contents), multitoken_query="and", stored=True),
            # ngram words, index ngrams of words from main content
            CONTENTNGRAM: NGRAMWORDS(minsize=3, maxsize=6),
        }
        latest_revs_fields.update(**common_fields)

        userprofile_fields = {
            # Note: email / openid (if given) should be unique, but we might
            # have lots of empty values if it is not given and thus it is NOT
            # unique overall! Wrongly declaring it unique would lead to whoosh
            # killing other users from the index when update_document() is
            # called! (See the sketch after this example.)
            EMAIL: ID(stored=True),
            OPENID: ID(stored=True),
            DISABLED: BOOLEAN(stored=True),
            LOCALE: ID(stored=True),
            SUBSCRIPTION_IDS: ID(),
            SUBSCRIPTION_PATTERNS: ID(),
        }
        latest_revs_fields.update(**userprofile_fields)

        # XXX This is a highly ad-hoc way to support indexing of ticket items.
        ticket_fields = {
            EFFORT: NUMERIC(stored=True),
            DIFFICULTY: NUMERIC(stored=True),
            SEVERITY: NUMERIC(stored=True),
            PRIORITY: NUMERIC(stored=True),
            ASSIGNED_TO: ID(stored=True),
            SUPERSEDED_BY: ID(stored=True),
            DEPENDS_ON: ID(stored=True),
            CLOSED: BOOLEAN(stored=True),
        }
        latest_revs_fields.update(**ticket_fields)

        blog_entry_fields = {
        }
        latest_revs_fields.update(**blog_entry_fields)

        all_revs_fields = {
            ITEMID: ID(stored=True),
        }
        all_revs_fields.update(**common_fields)

        latest_revisions_schema = Schema(**latest_revs_fields)
        all_revisions_schema = Schema(**all_revs_fields)

        # Define dynamic fields
        dynamic_fields = [("*_id", ID(stored=True)),
                          ("*_text", TEXT(stored=True)),
                          ("*_keyword", KEYWORD(stored=True)),
                          ("*_numeric", NUMERIC(stored=True)),
                          ("*_datetime", DATETIME(stored=True)),
                          ("*_boolean", BOOLEAN(stored=True)),
                         ]

        # Adding dynamic fields to schemas
        for glob, field_type in dynamic_fields:
            latest_revisions_schema.add(glob, field_type, glob=True)
            all_revisions_schema.add(glob, field_type, glob=True)

        # schemas are needed by query parser and for index creation
        self.schemas[ALL_REVS] = all_revisions_schema
        self.schemas[LATEST_REVS] = latest_revisions_schema

        # what fields could whoosh result documents have (no matter whether all revs index
        # or latest revs index):
        self.common_fields = set(latest_revs_fields.keys()) & set(all_revs_fields.keys())
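
The userprofile_fields comment above warns about a real Whoosh hazard: update_document() first deletes every existing document whose value in a unique field matches the incoming one, so declaring a field unique when many documents share the same value makes one update destroy unrelated documents. A standalone sketch of the failure mode (not moin code; the sample values are made up):

from whoosh.fields import Schema, ID, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.query import Every

schema = Schema(email=ID(unique=True, stored=True), name=TEXT(stored=True))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(email=u"missing@example.invalid", name=u"alice")
    w.add_document(email=u"missing@example.invalid", name=u"bob")
with ix.writer() as w:
    # update_document() deletes ALL documents whose email matches first,
    # so alice and bob are both gone after this commit:
    w.update_document(email=u"missing@example.invalid", name=u"carol")
with ix.searcher() as s:
    print([hit["name"] for hit in s.search(Every(), limit=None)])  # ['carol']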
Code example #3
File: search.py (project: tiramiseb/awesomeshop)
def init_indexes_and_parsers():
    path = app.config['SEARCH_INDEX_PATH']
    # Initialize the documentations index
    name = 'doc'
    if exists_in(path, indexname=name):
        indexes['doc'] = open_dir(path, indexname=name)
    else:
        try:
            os.makedirs(path)
        except OSError:
            pass
        schema = Schema(
                        id=ID(stored=True, unique=True),
                        )
        schema.add(
                'title_*',
                TEXT(field_boost=2.0, analyzer=domotego_analyzer),
                glob=True
                )
        schema.add('text_*', TEXT(analyzer=domotego_analyzer), glob=True)
        indexes['doc'] = create_in(path, schema, indexname=name)
        index_docs(Page.objects(pagetype='doc'))
    # Initialize the categories index
    name = 'category'
    if exists_in(path, indexname=name):
        indexes['category'] = open_dir(path, indexname=name)
    else:
        try:
            os.makedirs(path)
        except OSError:
            pass
        schema = Schema(
                        id=ID(stored=True, unique=True),
                        )
        schema.add(
                'name_*',
                TEXT(field_boost=2.0, analyzer=domotego_analyzer),
                glob=True
                )
        schema.add(
                'description_*',
                TEXT(analyzer=domotego_analyzer),
                glob=True
                )
        indexes['category'] = create_in(path, schema, indexname=name)
        index_categories(Category.objects)
    # Initialize the products index
    name = 'product'
    if exists_in(path, indexname=name):
        indexes['product'] = open_dir(path, indexname=name)
    else:
        try:
            os.makedirs(path)
        except OSError:
            pass
        schema = Schema(
                        id=ID(stored=True, unique=True),
                        reference=KEYWORD,
                        keywords=KEYWORD(lowercase=True, field_boost=1.5)
                        )
        schema.add(
                'name_*',
                TEXT(field_boost=2.0, analyzer=domotego_analyzer),
                glob=True
                )
        schema.add(
                'description_*',
                TEXT(analyzer=domotego_analyzer),
                glob=True
                )
        indexes['product'] = create_in(path, schema, indexname=name)
        index_products(BaseProduct.objects)

    # Initialize the parsers
    docparserfields = []
    categoryparserfields = []
    productparserfields = ['reference', 'keywords']
    for lg in app.config['LANGS']:
        docparserfields.append('title_'+lg)
        docparserfields.append('text_'+lg)
        categoryparserfields.append('name_'+lg)
        categoryparserfields.append('description_'+lg)
        productparserfields.append('name_'+lg)
        productparserfields.append('description_'+lg)
    parsers['doc'] = qparser.MultifieldParser(
                                    docparserfields,
                                    schema=indexes['doc'].schema,
                                    termclass=FuzzierTerm
                                    )
    parsers['category'] = qparser.MultifieldParser(
                                    categoryparserfields,
                                    schema=indexes['category'].schema,
                                    termclass=FuzzierTerm
                                    )
    parsers['product'] = qparser.MultifieldParser(
                                    productparserfields,
                                    schema=indexes['product'].schema,
                                    termclass=FuzzierTerm
                                    )
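
With the indexes and parsers initialized, one query searches every language variant of a field at once. A hedged usage sketch (the query string is made up; the names come from the code above):

query = parsers['product'].parse(u'waterproof jacket')
with indexes['product'].searcher() as searcher:
    for hit in searcher.search(query, limit=20):
        print(hit['id'])  # the stored, unique document id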
Code example #4
File: indexer.py (project: cozy/cozy-data-indexer)
class IndexSchema:
    """
    Init schema and build a custom analyzer.

    All data to index will be put inside the "content" field.
    """

    def __init__(self):

        chfilter = CharsetFilter(accent_map)
        stoplist = stoplists["en"].union(stoplists["fr"])
        analyzer = RegexTokenizer() | LowercaseFilter() | \
            StopFilter(stoplist=stoplist) | chfilter

        # defines the schema
        # see http://pythonhosted.org/Whoosh/schema.html for reference
        keywordType = KEYWORD(lowercase=True, scorable=True)
        self.schema = Schema(content=TEXT(analyzer=analyzer),
                             docType=TEXT,
                             docId=ID(stored=True, unique=True),
                             tags=keywordType)

        # Adds dynamic fields so each documents can index its fields in the
        # same Whoosh index
        self.schema.add('*_string', TEXT(analyzer=analyzer), glob=True)
        self.schema.add('*_date', DATETIME, glob=True)
        self.schema.add('*_number', NUMERIC, glob=True)
        self.schema.add('*_boolean', BOOLEAN, glob=True)

        # Create the index folder and the Whoosh index files if they don't
        # exist yet, and load the index in any case.
        if not os.path.exists("indexes"):
            os.mkdir("indexes")
            self.index = index.create_in("indexes", self.schema)
        else:
            self.index = index.open_dir("indexes")

        # Creates the doctypes folder if it doesn't exist
        if not os.path.exists("doctypes"):
            os.mkdir("doctypes")

        # Creates the doctypes default schema file if it doesn't exist
        if not os.path.exists('doctypes/doctypes_schema.json'):
            with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
                defaultFile.write("{}")

        '''
        Loads the doctypes schema if it's valid, otherwise recreates it
        Doctypes schema is a dictionary of doctypes with their fields created
        and updated when a document is indexed.
        That way, we can tell Whoosh which fields to search by default, because
        there is apparently no way to say "search in all fields".
        (See the sketch after this example.)
        '''
        with open('doctypes/doctypes_schema.json', 'r+') as rawJSON:
            try:
                self.doctypesSchema = json.load(rawJSON)
            except ValueError:
                rawJSON.write("{}")
                self.doctypesSchema = {}

    def update_doctypes_schema(self, schemaToUpdate):
        """
        Updates and persists the doctypes schema in its file
        """
        self.doctypesSchema.update(schemaToUpdate)

        with open('doctypes/doctypes_schema.json', 'w') as fileObject:
            fileObject.write(json.dumps(self.doctypesSchema))

    def clear_index(self):
        """
        Clear the index: index.create_in() creates a new, empty index in the
        directory even if an index already exists there.
        """

        if os.path.exists("indexes"):
            index.create_in("indexes", self.schema)

        if os.path.exists("doctypes"):
            with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
                defaultFile.write("{}")
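
The doctypes schema exists because, as the docstring above notes, Whoosh has no "search all fields" switch: the parser needs an explicit field list. A hedged sketch of how a caller might use it (the 'note' doctype and its field list are assumptions about the JSON file's layout, not part of this code):

from whoosh.qparser import MultifieldParser

index_schema = IndexSchema()
# assumed layout: {"note": ["title_string", "content_string"], ...}
fields = index_schema.doctypesSchema.get("note", [])
parser = MultifieldParser(fields, schema=index_schema.index.schema)
with index_schema.index.searcher() as searcher:
    for hit in searcher.search(parser.parse(u"meeting"), limit=10):
        print(hit["docId"])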
Code example #5
writer.delete_field("path")
# Don't do this!!!
writer.add_field("path", fields.KEYWORD)
(A future version of Whoosh may handle this mistake automatically.)

Dynamic fields



Dynamic fields associate a field type with any field name that matches a glob (wildcard) pattern.
You can add dynamic fields to a brand-new schema with the add() method, passing glob=True:
schema = fields.Schema(...)
# Any name ending in "_d" will be treated as a stored
# DATETIME field
schema.add("*_d", fields.DATETIME(stored=True), glob=True)
To set up a dynamic field on an existing index, use IndexWriter.add_field() just as you would to add a regular field, making sure the glob argument is True:
writer = ix.writer()
writer.add_field("*_d", fields.DATETIME(stored=True), glob=True)
writer.commit()
To delete a dynamic field, use IndexWriter.remove_field() with the glob as the name:
writer = ix.writer()
writer.remove_field("*_d")
writer.commit()
For example, to let documents contain any field name ending in _id and associate all such fields with the ID field type:
schema = fields.Schema(path=fields.ID)
schema.add("*_id", fields.ID, glob=True)
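
Putting the pieces together, a small end-to-end sketch of glob fields, using a RAM index so nothing touches disk:

from whoosh import fields
from whoosh.filedb.filestore import RamStorage

schema = fields.Schema(path=fields.ID(stored=True))
schema.add("*_id", fields.ID(stored=True), glob=True)

ix = RamStorage().create_index(schema)
with ix.writer() as writer:
    # "customer_id" matches the "*_id" glob, so it is indexed as an ID field
    writer.add_document(path=u"/orders/1", customer_id=u"C42")

with ix.searcher() as searcher:
    # returns the stored fields of the first matching document
    print(searcher.document(customer_id=u"C42"))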
Code example #6
class WhooshStore(SAMLStoreBase):
    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in ATTRS.keys():
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage, FileStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            # use the searcher from the context manager instead of opening a second one
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in info.pop('entity_attributes').items():
                info[a] = v
        for a, v in list(info.items()):  # snapshot; the dict is mutated below
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]

            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)

        for a in list(info.keys()):  # snapshot; keys are deleted below
            if a not in self.schema.names():
                del info[a]

        for a, v in list(info.items()):
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))
        # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1

        return ne

    def collections(self):
        return self._collections

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield ATTRS[n]

    def attributes(self):
        return list(self._attributes())

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return list(searcher.lexicon(n))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return self.objects.values()
            else:
                return self.infos.values()

        from whoosh.qparser import QueryParser
        #import pdb; pdb.set_trace()
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in ATTRS_INV.items():
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
        key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])

        return list(lst)
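
The string surgery in lookup() above is easier to follow in isolation. A standalone paraphrase of the rewriting, with the ATTRS_INV alias substitution left out (the sample key is made up):

import re

def prep_key(key):
    key = key.strip('+').replace('+', ' AND ')
    key = " {!s} ".format(key)
    key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)   # a=b    -> a:b
    key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)  # {uri}v -> uri:v
    return key.strip()

print(prep_key("entity_id=https://idp.example.edu/idp"))
# -> entity_id:https://idp.example.edu/idp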
Code example #7
File: store.py (project: mrvanes/pyFFplus)
class RedisWhooshStore(SAMLStoreBase):  # TODO: This needs a gc mechanism for keys (uuids)
    def json_dict(self, name):
        return LRUProxyDict(JSONDict(key='{}_{}'.format(self._name, name),
                                     redis=self._redis,
                                     writeback=True),
                            maxsize=config.cache_size)

    def xml_dict(self, name):
        return LRUProxyDict(XMLDict(key='{}_{}'.format(self._name, name),
                                    redis=self._redis,
                                    writeback=True),
                            maxsize=config.cache_size)

    def __init__(self, *args, **kwargs):
        self._dir = kwargs.pop('directory', '.whoosh')
        clear = bool(kwargs.pop('clear', config.store_clear))
        self._name = kwargs.pop('name', config.store_name)
        self._redis = kwargs.pop('redis', redis())
        if clear:
            shutil.rmtree(self._dir)
        now = datetime.now()
        self._last_index_time = now
        self._last_modified = now
        self._setup()
        if clear:
            self.reset()

    def _setup(self):
        self._redis = getattr(self, '_redis', None)
        if not self._redis:
            self._redis = redis()  # XXX test cases won't get correctly unpickled because of this
        self.schema = Schema(content=NGRAMWORDS(stored=False))
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        self.schema.add('sha1', ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self.objects = self.xml_dict('objects')
        self.parts = self.json_dict('parts')
        self.storage = FileStorage(os.path.join(self._dir, self._name))
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except BaseException as ex:
            log.warn(ex)
            self.storage.create()
            self.index = self.storage.create_index(self.schema)
            self._reindex()

    def __getstate__(self):
        state = dict()
        for p in ('_dir', '_name', '_last_index_time', '_last_modified'):
            state[p] = getattr(self, p)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._setup()

    def __call__(self, *args, **kwargs):
        watched = kwargs.pop('watched', None)
        scheduler = kwargs.pop('scheduler', None)
        if watched is not None and scheduler is not None:
            super(RedisWhooshStore, self).__call__(watched=watched,
                                                   scheduler=scheduler)
            log.debug("indexing using {}".format(scheduler))
            if scheduler is not None:  # and self._last_modified > self._last_index_time and :
                scheduler.add_job(RedisWhooshStore._reindex,
                                  args=[self],
                                  max_instances=1,
                                  coalesce=True,
                                  misfire_grace_time=2 *
                                  config.update_frequency)

    def _reindex(self):
        log.debug("indexing the store...")
        self._last_index_time = datetime.now()
        seen = set()
        refs = set([b2u(s) for s in self.objects.keys()])
        parts = self.parts.values()
        for ref in refs:
            for part in parts:
                if ref in part['items']:
                    seen.add(ref)

        ix = self.storage.open_index()
        lock = ix.lock("reindex")
        try:
            log.debug("waiting for index lock")
            lock.acquire(True)
            log.debug("got index lock")
            with ix.writer() as writer:
                for ref in refs:
                    if ref not in seen:
                        log.debug("removing unseen ref {}".format(ref))
                        del self.objects[ref]
                        del self.parts[ref]

                log.debug("updating index")
                for e in self.objects.values():
                    info = self._index_prep(entity_simple_info(e))
                    ref = object_id(e)
                    writer.add_document(object_id=ref, **info)

                writer.mergetype = CLEAR
        finally:
            try:
                log.debug("releasing index lock")
                lock.release()
            except ThreadError:
                pass

    def dump(self):
        ix = self.storage.open_index()
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        res = dict()
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v

        content = " ".join(
            filter(lambda x: x is not None, [
                info.get(x, '') for x in ('service_name', 'title', 'domain',
                                          'keywords', 'scopes')
            ]))
        res['content'] = content.strip()
        for a, v in info.items():
            k = a
            if a in ATTRS_INV:
                k = ATTRS_INV[a]

            if k in self.schema.names():
                if type(v) in (list, tuple):
                    res[k] = " ".join([vv.lower() for vv in v])
                elif isinstance(v, six.string_types):
                    res[k] = v.lower()
        res['sha1'] = hash_id(info['entity_id'], prefix=False)
        return res

    def update(self, t, tid=None, etag=None, lazy=True):
        relt = root(t)
        assert (relt is not None)

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            ref = object_id(relt)
            parts = None
            if ref in self.parts:
                parts = self.parts[ref]
            if etag is not None and (parts is None
                                     or parts.get('etag', None) != etag):
                self.parts[ref] = {
                    'id': relt.get('entityID'),
                    'etag': etag,
                    'count': 1,
                    'items': [ref]
                }
                self.objects[ref] = relt
                self._last_modified = datetime.now()
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            if etag is None:
                etag = hex_digest(dumptree(t, pretty_print=False), 'sha256')
            parts = None
            if tid in self.parts:
                parts = self.parts[tid]
            if parts is None or parts.get('etag', None) != etag:
                items = set()
                for e in iter_entities(t):
                    ref = object_id(e)
                    items.add(ref)
                    self.objects[ref] = e
                self.parts[tid] = {
                    'id': tid,
                    'count': len(items),
                    'etag': etag,
                    'items': list(items)
                }
                self._last_modified = datetime.now()

        if not lazy:
            self._reindex()

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def collections(self):
        return [b2u(ref) for ref in self.parts.keys()]

    def reset(self):
        for k in ('{}_{}'.format(self._name, 'parts'),
                  '{}_{}'.format(self._name, 'objects')):
            self._redis.delete(k)

    def size(self, a=None, v=None):
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def _prep_key(self, key):
        # import pdb; pdb.set_trace()
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        key = key.replace('-', ' AND NOT ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
        key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)
        key = key.strip()

        return key

    def _entities(self):
        lst = set()
        for ref_data in self.parts.values():
            for ref in ref_data['items']:
                e = self.objects.get(ref, None)
                if e is not None:
                    lst.add(e)

        return b2u(list(lst))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def lookup(self, key):
        if key == 'entities' or key is None:
            return self._entities()

        bkey = six.b(key)
        if bkey in self.objects:
            return [self.objects.get(bkey)]

        if bkey in self.parts:
            res = []
            part = self.parts.get(bkey)
            for item in part['items']:
                res.extend(self.lookup(item))
            return res

        key = self._prep_key(key)
        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                e = self.objects.get(result['object_id'], None)
                if e is not None:
                    lst.add(e)

        return b2u(list(lst))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def search(self, query=None, path=None, entity_filter=None, related=None):
        if entity_filter:
            query = "{!s} AND {!s}".format(query, entity_filter)
        query = self._prep_key(query)
        qp = MultifieldParser(['content', 'domain'], schema=self.schema)
        q = qp.parse(query)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            log.debug(results)
            for result in results:
                lst.add(result['object_id'])

        res = list()
        for ref in lst:
            e = self.objects.get(ref, None)
            if e is not None:
                res.append(discojson(e))
        return res
Code example #8
File: indexing.py (project: pombredanne/moin2)
    def __init__(self, index_dir, backend, wiki_name=None, acl_rights_contents=[], **kw):
        """
        Store params, create schemas.
        """
        self.index_dir = index_dir
        self.index_dir_tmp = index_dir + '.temp'
        self.backend = backend
        self.wikiname = wiki_name
        self.ix = {}  # open indexes
        self.schemas = {}  # existing schemas

        common_fields = {
            # wikiname so we can have a shared index in a wiki farm, always check this!
            WIKINAME: ID(stored=True),
            # tokenized NAME from metadata - use this for manual searching from UI
            NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
            # unmodified NAME from metadata - use this for precise lookup by the code.
            # also needed for wildcard search, so the original string as well as the query
            # (with the wildcard) is not cut into pieces.
            NAME_EXACT: ID(field_boost=3.0),
            # revision id (aka meta id)
            REVID: ID(unique=True, stored=True),
            # parent revision id
            PARENTID: ID(stored=True),
            # MTIME from revision metadata (converted to UTC datetime)
            MTIME: DATETIME(stored=True),
            # tokenized CONTENTTYPE from metadata
            CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
            # unmodified list of TAGS from metadata
            TAGS: ID(stored=True),
            LANGUAGE: ID(stored=True),
            # USERID from metadata
            USERID: ID(stored=True),
            # ADDRESS from metadata
            ADDRESS: ID(stored=True),
            # HOSTNAME from metadata
            HOSTNAME: ID(stored=True),
            # SIZE from metadata
            SIZE: NUMERIC(stored=True),
            # ACTION from metadata
            ACTION: ID(stored=True),
            # tokenized COMMENT from metadata
            COMMENT: TEXT(stored=True),
            # SUMMARY from metadata
            SUMMARY: TEXT(stored=True),
            # data (content), converted to text/plain and tokenized
            CONTENT: TEXT(stored=True),
        }

        latest_revs_fields = {
            # ITEMID from metadata - as there is only latest rev of same item here, it is unique
            ITEMID: ID(unique=True, stored=True),
            # unmodified list of ITEMLINKS from metadata
            ITEMLINKS: ID(stored=True),
            # unmodified list of ITEMTRANSCLUSIONS from metadata
            ITEMTRANSCLUSIONS: ID(stored=True),
            # tokenized ACL from metadata
            ACL: TEXT(analyzer=AclTokenizer(acl_rights_contents), multitoken_query="and", stored=True),
        }
        latest_revs_fields.update(**common_fields)

        userprofile_fields = {
            # Note: email / openid (if given) should be unique, but we might
            # have lots of empty values if it is not given and thus it is NOT
            # unique overall! Wrongly declaring it unique would lead to whoosh
            # killing other users from the index when update_document() is called!
            EMAIL: ID(stored=True),
            OPENID: ID(stored=True),
        }
        latest_revs_fields.update(**userprofile_fields)

        all_revs_fields = {
            ITEMID: ID(stored=True),
        }
        all_revs_fields.update(**common_fields)

        latest_revisions_schema = Schema(**latest_revs_fields)
        all_revisions_schema = Schema(**all_revs_fields)

        # Define dynamic fields
        dynamic_fields = [("*_id", ID(stored=True)),
                          ("*_text", TEXT(stored=True)),
                          ("*_keyword", KEYWORD(stored=True)),
                          ("*_numeric", NUMERIC(stored=True)),
                          ("*_datetime", DATETIME(stored=True)),
                          ("*_boolean", BOOLEAN(stored=True)),
                         ]

        # Adding dynamic fields to schemas
        for glob, field_type in dynamic_fields:
            latest_revisions_schema.add(glob, field_type, glob=True)
            all_revisions_schema.add(glob, field_type, glob=True)

        # schemas are needed by query parser and for index creation
        self.schemas[ALL_REVS] = all_revisions_schema
        self.schemas[LATEST_REVS] = latest_revisions_schema

        # what fields could whoosh result documents have (no matter whether all revs index
        # or latest revs index):
        self.common_fields = set(latest_revs_fields.keys()) & set(all_revs_fields.keys())
Code example #9
    def _build_doc_attrs(self, model_class: Type[Model],
                         schema: Schema) -> None:
        mapper = sa.inspect(model_class)

        args = self.doc_attrs
        # Any field not in the schema will be stored here.
        # After all fields have been discovered, we add the missing ones.
        field_definitions = {}

        def setup_field(
                attr_name: str, field_name: Union[Tuple[str, Union[type, ID]],
                                                  str]) -> None:
            field_def = False
            if not isinstance(field_name, str):
                field_name, field_def = field_name

            if field_name not in schema:
                if (field_name not in field_definitions
                        or field_definitions[field_name] is False):
                    field_definitions[field_name] = field_def

            # attrgetter offers dotted name support. Useful for attributes on
            # related objects. (See the sketch after this example.)
            args.setdefault(field_name, {})[attr_name] = attrgetter(attr_name)

        # model level definitions
        for name, field_names in self.index_to:
            if isinstance(field_names, str):
                field_names = (field_names, )
            for field_name in field_names:
                setup_field(name, field_name)

        # per column definitions
        for col in mapper.columns:
            name = col.name
            info = col.info

            if not info.get("searchable"):
                continue

            index_to = info.get("index_to", (name, ))
            if isinstance(index_to, str):
                index_to = (index_to, )

            for field_name in index_to:
                setup_field(name, field_name)

        # add missing fields to schema
        for field_name, field_def in field_definitions.items():
            if field_name in schema:
                continue

            if field_def is False:
                field_def = TEXT(stored=True, analyzer=accent_folder)

            logger.debug(
                "Adding field to schema:\n"
                "  Model: %s\n"
                '  Field: "%s" %s',
                model_class._object_type(),
                field_name,
                field_def,
            )
            schema.add(field_name, field_def)
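
The attrgetter remark above is the key trick: dotted attribute paths let one indexed field pull its value off a related object. A two-line illustration (the post/author names are hypothetical):

from operator import attrgetter

get_author_name = attrgetter("author.name")
# given a model instance `post` with a related `author` object,
# get_author_name(post) returns post.author.name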
Code example #10
File: store.py (project: leifj/pyFF)
class WhooshStore(SAMLStoreBase):

    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage, FileStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in searcher.search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        for a, v in list(info.items()):
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]

            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)

        for a in list(info.keys()):
            if a not in self.schema.names():
                del info[a]

        for a, v in list(info.items()):
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))
        # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1

        return ne

    def collections(self):
        return b2u(self._collections)

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(list(self.objects.keys()))
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return b2u(list(self.objects.values()))
            else:
                return b2u(list(self.infos.values()))

        from whoosh.qparser import QueryParser
        # import pdb; pdb.set_trace()
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub(r"([^=]+)=(\S+)", r"\1:\2", key)
        key = re.sub(r"{([^}]+)}(\S+)", r"\1:\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])

        return b2u(list(lst))
Code example #11
    def __init__(self, index_dir, backend, user_name=None, acl_support=False, **kw):
        """
        Store params, create schemas.
        """
        self.index_dir = index_dir
        self.index_dir_tmp = index_dir + '.temp'
        self.backend = backend
        self.user_name = user_name # TODO use currently logged-in username
        self.acl_support = acl_support
        self.wikiname = u'' # TODO take from app.cfg.interwikiname
        self.ix = {}  # open indexes
        self.schemas = {}  # existing schemas

        common_fields = {
            # wikiname so we can have a shared index in a wiki farm, always check this!
            WIKINAME: ID(stored=True),
            # tokenized NAME from metadata - use this for manual searching from UI
            # TODO was: NAME: TEXT(stored=True, multitoken_query="and", analyzer=item_name_analyzer(), field_boost=2.0),
            NAME: ID(stored=True, field_boost=2.0),
            # unmodified NAME from metadata - use this for precise lookup by the code.
            # also needed for wildcard search, so the original string as well as the query
            # (with the wildcard) is not cut into pieces.
            NAME_EXACT: ID(field_boost=3.0),
            # revision id (aka meta id)
            REVID: ID(unique=True, stored=True),
            # MTIME from revision metadata (converted to UTC datetime)
            MTIME: DATETIME(stored=True),
            # tokenized CONTENTTYPE from metadata
            # TODO was: CONTENTTYPE: TEXT(stored=True, multitoken_query="and", analyzer=MimeTokenizer()),
            CONTENTTYPE: ID(stored=True),
            # unmodified list of TAGS from metadata
            TAGS: ID(stored=True),
            LANGUAGE: ID(stored=True),
            # USERID from metadata TODO: -> user ITEMID
            USERID: ID(stored=True),
            # ADDRESS from metadata
            ADDRESS: ID(stored=True),
            # HOSTNAME from metadata
            HOSTNAME: ID(stored=True),
            # SIZE from metadata
            SIZE: NUMERIC(stored=True),
            # ACTION from metadata
            ACTION: ID(stored=True),
            # tokenized COMMENT from metadata
            COMMENT: TEXT(stored=True),
            # data (content), converted to text/plain and tokenized
            CONTENT: TEXT(stored=True),
        }

        latest_revs_fields = {
            # ITEMID from metadata - as there is only latest rev of same item here, it is unique
            ITEMID: ID(unique=True, stored=True),
            # unmodified list of ITEMLINKS from metadata
            ITEMLINKS: ID(stored=True),
            # unmodified list of ITEMTRANSCLUSIONS from metadata
            ITEMTRANSCLUSIONS: ID(stored=True),
            # tokenized ACL from metadata
            # TODO was: ACL: TEXT(analyzer=AclTokenizer(self._cfg), multitoken_query="and", stored=True),
            ACL: ID(stored=True),
        }
        latest_revs_fields.update(**common_fields)

        userprofile_fields = {
            EMAIL: ID(unique=True, stored=True),
            OPENID: ID(unique=True, stored=True),
        }
        latest_revs_fields.update(**userprofile_fields)

        all_revs_fields = {
            ITEMID: ID(stored=True),
        }
        all_revs_fields.update(**common_fields)

        latest_revisions_schema = Schema(**latest_revs_fields)
        all_revisions_schema = Schema(**all_revs_fields)

        # Define dynamic fields
        dynamic_fields = [("*_id", ID(stored=True)),
                          ("*_text", TEXT(stored=True)),
                          ("*_keyword", KEYWORD(stored=True)),
                          ("*_numeric", NUMERIC(stored=True)),
                          ("*_datetime", DATETIME(stored=True)),
                          ("*_boolean", BOOLEAN(stored=True)),
                         ]

        # Adding dynamic fields to schemas
        for glob, field_type in dynamic_fields:
            latest_revisions_schema.add(glob, field_type, glob=True)
            all_revisions_schema.add(glob, field_type, glob=True)

        # schemas are needed by query parser and for index creation
        self.schemas[ALL_REVS] = all_revisions_schema
        self.schemas[LATEST_REVS] = latest_revisions_schema
Code example #12
class IndexConfig:
    def __init__(self, config_dict):
        self.__index_config_dict = config_dict

        self.__schema = Schema()

        for field_name, field_config in self.__index_config_dict['schema'].items():
            field_type = self.__get_field_type(field_config['field_type'])
            for arg, value in field_config['args'].items():
                setattr(field_type, arg, value)
            self.__schema.add(field_name, field_type, glob=False)

        if not self.__validate():
            raise ValueError('invalid schema')

    def __get_filter(self, name):
        class_name = self.__index_config_dict['filters'][name]['class']
        class_args = {}
        if 'args' in self.__index_config_dict['filters'][name]:
            class_args = deepcopy(
                self.__index_config_dict['filters'][name]['args'])

        instance = get_instance(class_name, **class_args)

        return instance

    def __get_tokenizer(self, name):
        class_name = self.__index_config_dict['tokenizers'][name]['class']
        class_args = {}
        if 'args' in self.__index_config_dict['tokenizers'][name]:
            class_args = deepcopy(
                self.__index_config_dict['tokenizers'][name]['args'])

        instance = get_instance(class_name, **class_args)

        return instance

    def __get_analyzer(self, name):
        instance = None

        if 'class' in self.__index_config_dict['analyzers'][name]:
            class_name = self.__index_config_dict['analyzers'][name]['class']
            class_args = {}
            if 'args' in self.__index_config_dict['analyzers'][name]:
                class_args = deepcopy(
                    self.__index_config_dict['analyzers'][name]['args'])

            instance = get_instance(class_name, **class_args)
        elif 'tokenizer' in self.__index_config_dict['analyzers'][name]:
            instance = self.__get_tokenizer(
                self.__index_config_dict['analyzers'][name]['tokenizer'])
            if 'filters' in self.__index_config_dict['analyzers'][name]:
                for filter_name in self.__index_config_dict['analyzers'][name][
                        'filters']:
                    instance = instance | self.__get_filter(filter_name)

        return instance

    def __get_field_type(self, name):
        class_name = self.__index_config_dict['field_types'][name]['class']
        class_args = {}
        if 'args' in self.__index_config_dict['field_types'][name]:
            class_args = deepcopy(
                self.__index_config_dict['field_types'][name]['args'])
            if 'analyzer' in class_args:
                class_args['analyzer'] = self.__get_analyzer(
                    class_args['analyzer']) if class_args['analyzer'] else None
            if 'tokenizer' in class_args:
                class_args['tokenizer'] = self.__get_tokenizer(
                    class_args['tokenizer']
                ) if class_args['tokenizer'] else None

        instance = get_instance(class_name, **class_args)

        return instance

    def __get_unique_fields(self):
        return [name for name, field in self.__schema.items() if field.unique]

    def __validate(self):
        valid = False

        if len(self.__get_unique_fields()) == 1:
            valid = True

        return valid

    def get_schema(self):
        return self.__schema

    def get_doc_id_field(self):
        return self.__get_unique_fields()[0]

    def get_storage_type(self):
        try:
            storage_type = self.__index_config_dict['storage']['type']
        except KeyError:
            storage_type = 'file'

        return storage_type

    def get_writer_processors(self):
        try:
            procs = self.__index_config_dict['writer']['processors']
        except KeyError:
            procs = 1

        return procs

    def get_writer_batch_size(self):
        try:
            batch_size = self.__index_config_dict['writer']['batch_size']
        except KeyError:
            batch_size = 100

        return batch_size

    def get_writer_multi_segment(self):
        try:
            multi_segment = self.__index_config_dict['writer']['multi_segment']
        except KeyError:
            multi_segment = False

        return multi_segment

    def get_writer_auto_commit_period(self):
        try:
            period = self.__index_config_dict['writer']['auto_commit'][
                'period']
        except KeyError:
            period = 0
        return period

    def get_writer_auto_commit_limit(self):
        try:
            limit = self.__index_config_dict['writer']['auto_commit']['limit']
        except KeyError:
            limit = 10
        return limit
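
The nested dictionary this class consumes is easiest to see by example. A hedged sketch of a minimal config (the key layout is inferred from the lookups above; get_instance is assumed to import the dotted class path and call it with the given args):

config_dict = {
    'schema': {
        'id': {'field_type': 'id', 'args': {'unique': True, 'stored': True}},
        'title': {'field_type': 'text', 'args': {'stored': True}},
    },
    'field_types': {
        'id': {'class': 'whoosh.fields.ID'},
        'text': {'class': 'whoosh.fields.TEXT'},
    },
    'storage': {'type': 'file'},
    'writer': {'processors': 1, 'batch_size': 100},
}

index_config = IndexConfig(config_dict)
print(index_config.get_doc_id_field())  # 'id' -- the single unique field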
Code example #13
File: indexer.py (project: obigroup/cozy-data-indexer)
class IndexSchema:
    """
    Init schema and build a custom analyzer.

    All data to index will be put inside the "content" field.
    """
    def __init__(self):

        chfilter = CharsetFilter(accent_map)
        stoplist = stoplists["en"].union(stoplists["fr"])
        analyzer = RegexTokenizer() | LowercaseFilter() | \
                   StopFilter(stoplist=stoplist) | chfilter

        # defines the schema
        # see http://pythonhosted.org/Whoosh/schema.html for reference
        keywordType = KEYWORD(lowercase=True, scorable=True)
        self.schema = Schema(content=TEXT(analyzer=analyzer),
                             docType=TEXT,
                             docId=ID(stored=True, unique=True),
                             tags=keywordType)

        # Adds dynamic fields so each documents can index its fields in the
        # same Whoosh index
        self.schema.add('*_string', TEXT(analyzer=analyzer), glob=True)
        self.schema.add('*_date', DATETIME, glob=True)
        self.schema.add('*_number', NUMERIC, glob=True)
        self.schema.add('*_boolean', BOOLEAN, glob=True)

        # Create the index folder and the Whoosh index files if they don't
        # exist yet, and load the index in any case.
        if not os.path.exists("indexes"):
            os.mkdir("indexes")
            self.index = index.create_in("indexes", self.schema)
        else:
            self.index = index.open_dir("indexes")

        # Creates the doctypes folder if it doesn't exist
        if not os.path.exists("doctypes"):
            os.mkdir("doctypes")

        # Creates the doctypes default schema file if it doesn't exist
        if not os.path.exists('doctypes/doctypes_schema.json'):
            with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
                defaultFile.write("{}")
        '''
        Loads the doctypes schema if it's valid, otherwise recreates it
        Doctypes schema is a dictionary of doctypes with their fields created
        and updated when a document is indexed.
        That way, we can tell Whoosh which fields to search by default, because
        there is apparently no way to say "search in all fields".
        '''
        with open('doctypes/doctypes_schema.json', 'r+') as rawJSON:
            try:
                self.doctypesSchema = json.load(rawJSON)
            except ValueError:
                rawJSON.write("{}")
                self.doctypesSchema = {}

    def update_doctypes_schema(self, schemaToUpdate):
        """
        Updates and persists the doctypes schema in its file
        """
        self.doctypesSchema.update(schemaToUpdate)

        with open('doctypes/doctypes_schema.json', 'w') as fileObject:
            fileObject.write(json.dumps(self.doctypesSchema))

    def clear_index(self):
        """
        Clear the index: index.create_in() creates a new, empty index in the
        directory even if an index already exists there.
        """

        if os.path.exists("indexes"):
            index.create_in("indexes", self.schema)

        if os.path.exists("doctypes"):
            with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
                defaultFile.write("{}")