Example #1
0
def test_stored_fields2():
    schema = fields.Schema(
        content=fields.TEXT(stored=True),
        title=fields.TEXT(stored=True),
        summary=fields.STORED,
        path=fields.ID(stored=True),
        helpid=fields.KEYWORD,
        parent=fields.KEYWORD,
        context=fields.KEYWORD(stored=True),
        type=fields.KEYWORD(stored=True),
        status=fields.KEYWORD(stored=True),
        superclass=fields.KEYWORD(stored=True),
        exampleFor=fields.KEYWORD(stored=True),
        chapter=fields.KEYWORD(stored=True),
        replaces=fields.KEYWORD,
        time=fields.STORED,
        methods=fields.STORED,
        exampleFile=fields.STORED,
    )

    storedkeys = [
        "chapter", "content", "context", "exampleFile", "exampleFor",
        "methods", "path", "status", "summary", "superclass", "time", "title",
        "type"
    ]
    assert_equal(storedkeys, schema.stored_names())

    st = RamStorage()
    ix = st.create_index(schema)

    writer = ix.writer()
    writer.add_document(content=u("Content of this document."),
                        title=u("This is the title"),
                        summary=u("This is the summary"),
                        path=u("/main"))
    writer.add_document(content=u("Second document."),
                        title=u("Second title"),
                        summary=u("Summary numero due"),
                        path=u("/second"))
    writer.add_document(content=u("Third document."),
                        title=u("Title 3"),
                        summary=u("Summary treo"),
                        path=u("/san"))
    writer.commit()
    ix.close()

    ix = st.open_index()
    with ix.searcher() as s:
        doc = s.document(path="/main")
        assert doc is not None
        assert ([doc[k] for k in sorted(doc.keys())] == [
            "Content of this document.", "/main", "This is the summary",
            "This is the title"
        ])

    ix.close()
Example #2
0
def test_pystemmer():
    ana = (analysis.RegexTokenizer()
           | analysis.LowercaseFilter()
           | analysis.PyStemmerFilter())
    schema = fields.Schema(text=fields.TEXT(analyzer=ana))
    st = RamStorage()

    ix = st.create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u("rains falling strangely"))

    ix = st.open_index()
    with ix.writer() as w:
        w.add_document(text=u("pains stalling strongly"))

    ix = st.open_index()
    with ix.reader() as r:
        assert_equal(list(r.lexicon("text")),
                     ["fall", "pain", "rain", "stall", "strang", "strong"])
Example #3
0
def test_persistent_cache():
    schema = fields.Schema(id=fields.ID(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)
    with ix.writer() as w:
        for term in u("charlie alfa echo bravo delta").split():
            w.add_document(id=term)

    ix = st.open_index()
    with ix.reader() as r:
        _ = r.fieldcache("id")
        del _

    ix = st.open_index()
    with ix.reader() as r:
        assert r.fieldcache_available("id")
        assert not r.fieldcache_loaded("id")
        fc = r.fieldcache("id")
        assert r.fieldcache_loaded("id")
        assert_equal(list(fc.order), [3, 1, 5, 2, 4])
        assert_equal(list(fc.texts), [u('\uffff'), u'alfa', u'bravo',
                                      u'charlie', u'delta', u'echo'])
Example #4
0
def test_persistent_cache():
    schema = fields.Schema(id=fields.ID(stored=True))
    st = RamStorage()
    ix = st.create_index(schema)
    with ix.writer() as w:
        for term in u("charlie alfa echo bravo delta").split():
            w.add_document(id=term)

    ix = st.open_index()
    with ix.reader() as r:
        _ = r.fieldcache("id")
        del _

    ix = st.open_index()
    with ix.reader() as r:
        assert r.fieldcache_available("id")
        assert not r.fieldcache_loaded("id")
        fc = r.fieldcache("id")
        assert r.fieldcache_loaded("id")
        assert_equal(list(fc.order), [3, 1, 5, 2, 4])
        assert_equal(
            list(fc.texts),
            [u('\uffff'), u'alfa', u'bravo', u'charlie', u'delta', u'echo'])
Example #5
0
def test_stored_fields2():
    schema = fields.Schema(content=fields.TEXT(stored=True),
                           title=fields.TEXT(stored=True),
                           summary=fields.STORED,
                           path=fields.ID(stored=True),
                           helpid=fields.KEYWORD,
                           parent=fields.KEYWORD,
                           context=fields.KEYWORD(stored=True),
                           type=fields.KEYWORD(stored=True),
                           status=fields.KEYWORD(stored=True),
                           superclass=fields.KEYWORD(stored=True),
                           exampleFor=fields.KEYWORD(stored=True),
                           chapter=fields.KEYWORD(stored=True),
                           replaces=fields.KEYWORD,
                           time=fields.STORED,
                           methods=fields.STORED,
                           exampleFile=fields.STORED,
                           )
    
    storedkeys = ["chapter", "content", "context", "exampleFile",
                  "exampleFor", "methods", "path", "status", "summary",
                  "superclass", "time", "title", "type"]
    assert_equal(storedkeys, schema.stored_names())
    
    st = RamStorage()
    ix = st.create_index(schema)
    
    writer = ix.writer()
    writer.add_document(content=u("Content of this document."),
                        title=u("This is the title"),
                        summary=u("This is the summary"), path=u("/main"))
    writer.add_document(content=u("Second document."), title=u("Second title"),
                        summary=u("Summary numero due"), path=u("/second"))
    writer.add_document(content=u("Third document."), title=u("Title 3"),
                        summary=u("Summary treo"), path=u("/san"))
    writer.commit()
    ix.close()
    
    ix = st.open_index()
    with ix.searcher() as s:
        doc = s.document(path="/main")
        assert ([doc[k] for k in sorted(doc.keys())]
                == ["Content of this document.", "/main",
                    "This is the summary", "This is the title"])
    
    ix.close()
Example #6
0
class WhooshStore(SAMLStoreBase):
    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in ATTRS.keys():
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage, FileStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in ix.searcher().search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in info.pop('entity_attributes').items():
                info[a] = v
        for a, v in info.items():
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]

            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)

        for a in info.keys():
            if not a in self.schema.names():
                del info[a]

        for a, v in info.items():
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))
        # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1

        return ne

    def collections(self):
        return self._collections

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield ATTRS[n]

    def attributes(self):
        return list(self._attributes())

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return list(searcher.lexicon(n))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return self.objects.values()
            else:
                return self.infos.values()

        from whoosh.qparser import QueryParser
        #import pdb; pdb.set_trace()
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in ATTRS_INV.items():
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])

        return list(lst)
Example #7
0
File: store.py Project: leifj/pyFF
class WhooshStore(SAMLStoreBase):

    def __init__(self):
        self.schema = Schema(scopes=KEYWORD(),
                             descr=TEXT(),
                             service_name=TEXT(),
                             service_descr=TEXT(),
                             keywords=KEYWORD())
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self._collections = set()
        from whoosh.filedb.filestore import RamStorage, FileStorage
        self.storage = RamStorage()
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self.objects = dict()
        self.infos = dict()

    def dump(self):
        ix = self.storage.open_index()
        print(ix.schema)
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in ix.searcher().search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        for a, v in list(info.items()):
            if type(v) is not list and type(v) is not tuple:
                info[a] = [info.pop(a)]

            if a in ATTRS_INV:
                info[ATTRS_INV[a]] = info.pop(a)

        for a in list(info.keys()):
            if a not in self.schema.names():
                del info[a]

        for a, v in list(info.items()):
            info[a] = [six.text_type(vv) for vv in v]

    def _index(self, e, tid=None):
        info = entity_info(e)
        if tid is not None:
            info['collection_id'] = tid
        self._index_prep(info)
        id = six.text_type(object_id(e))
        # mix in tid here
        self.infos[id] = info
        self.objects[id] = e
        ix = self.storage.open_index()
        with ix.writer() as writer:
            writer.add_document(object_id=id, **info)
            writer.mergetype = writing.CLEAR

    def update(self, t, tid=None, ts=None, merge_strategy=None):
        relt = root(t)
        assert (relt is not None)
        ne = 0

        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            self._index(relt)
            ne += 1
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            self._collections.add(tid)
            for e in iter_entities(t):
                self._index(e, tid=tid)
                ne += 1

        return ne

    def collections(self):
        return b2u(self._collections)

    def reset(self):
        self.__init__()

    def size(self, a=None, v=None):
        if a is None:
            return len(list(self.objects.keys()))
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def lookup(self, key, raw=True, field="entity_id"):
        if key == 'entities' or key is None:
            if raw:
                return b2u(list(self.objects.values()))
            else:
                return b2u(list(self.infos.values()))

        from whoosh.qparser import QueryParser
        # import pdb; pdb.set_trace()
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()

        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                if raw:
                    lst.add(self.objects[result['object_id']])
                else:
                    lst.add(self.infos[result['object_id']])

        return b2u(list(lst))