Ejemplo n.º 1
0
class WorkspaceSchema(fields.SchemaClass):
    """Whoosh index schema for workspace documents.

    Fields marked stored=True can be read back from search hits; the
    NGRAM description fields (minsize=1) support substring matching.
    """

    # Unique workspace identifier.
    id = fields.ID(stored=True, unique=True)
    # spelling=True keeps word graphs for "did you mean" suggestions.
    owner = fields.TEXT(stored=True, spelling=True)
    name = fields.TEXT(stored=True, spelling=True)
    # N-gram indexed so partial words match; phrase=True keeps positions.
    description = fields.NGRAM(stored=True, minsize=1, phrase=True)
    lastmodified = fields.DATETIME(stored=True)
    longdescription = fields.NGRAM(stored=True, minsize=1, phrase=True)
    public = fields.BOOLEAN(stored=True)
    # Comma-separated keyword lists; indexed but not stored.
    users = fields.KEYWORD(commas=True)
    groups = fields.KEYWORD(commas=True)
    shared = fields.BOOLEAN(stored=True)
Ejemplo n.º 2
0
def test_boolean_strings():
    """BOOLEAN fields should accept the strings true/True/false/False at
    index time and match the query terms true, True, false, False, t, f."""
    schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)

    values = ["true", "True", "false", "False",
              u("true"), u("True"), u("false"), u("False")]
    with ix.writer() as w:
        for num, value in enumerate(values):
            w.add_document(i=num, b=value)

    with ix.searcher() as s:
        parser = qparser.QueryParser("b", ix.schema)

        def matched(query_string):
            hits = s.search(parser.parse(query_string), limit=None)
            return [hit["i"] for hit in hits]

        trues = [0, 1, 4, 5]
        falses = [2, 3, 6, 7]
        for qs in ("true", "True", "t"):
            assert matched(qs) == trues
        for qs in ("false", "False", "f"):
            assert matched(qs) == falses
Ejemplo n.º 3
0
def get_schema(model, analyzer):
    """Build a Whoosh schema from a SQLAlchemy model's columns.

    The primary-key column is always indexed as a unique, sortable ID;
    other columns are indexed only when listed in the model's
    ``__searchable__`` attribute. Returns a ``(Schema, primary_key_name)``
    tuple.

    Raises WhooshAlchemyError for searchable columns of unsupported types.
    """
    searchable = set(getattr(model, '__searchable__', []))
    primary = None
    schema = {}

    for column in model.__table__.columns:
        if column.primary_key:
            primary = column.name
            schema[column.name] = whoosh_fields.ID(stored=True,
                                                   unique=True,
                                                   sortable=True)

        if column.name not in searchable:
            continue

        column_type = column.type
        if isinstance(column_type, TEXT_TYPES):
            schema[column.name] = whoosh_fields.TEXT(analyzer=analyzer)
        elif isinstance(column_type, DATE_TYPES):
            schema[column.name] = whoosh_fields.DATETIME(
                unique=getattr(column, 'unique', False))
        elif isinstance(column_type, sql_types.Boolean):
            schema[column.name] = whoosh_fields.BOOLEAN()
        else:
            raise WhooshAlchemyError('cannot index column of type %s' %
                                     column_type)

    return whoosh_fields.Schema(**schema), primary
Ejemplo n.º 4
0
def test_boolean():
    """Index Python bools and query them with true/yes and false/no."""
    schema = fields.Schema(id=fields.ID(stored=True),
                           done=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    for key, flag in (("a", True), ("b", False), ("c", True),
                      ("d", False), ("e", True)):
        writer.add_document(id=u(key), done=flag)
    writer.commit()

    with ix.searcher() as s:
        qp = qparser.QueryParser("id", schema)

        # "true" and "yes" are synonyms for the True term.
        for qs in ("done:true", "done:yes"):
            r = s.search(qp.parse(qs))
            assert_equal(sorted([d["id"] for d in r]), ["a", "c", "e"])
            assert all(d["done"] for d in r)

        # "false" and "no" are synonyms for the False term.
        for qs in ("done:false", "done:no"):
            r = s.search(qp.parse(qs))
            assert_equal(sorted([d["id"] for d in r]), ["b", "d"])
            assert not any(d["done"] for d in r)
Ejemplo n.º 5
0
def test_boolean():
    """Round-trip Python bools through a BOOLEAN field and query them."""
    schema = fields.Schema(id=fields.ID(stored=True),
                           done=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    for key, flag in (("a", True), ("b", False), ("c", True),
                      ("d", False), ("e", True)):
        writer.add_document(id=u(key), done=flag)
    writer.commit()

    with ix.searcher() as s:
        parser = qparser.QueryParser("id", schema)

        for qs in ("done:true", "done:yes"):
            r = s.search(parser.parse(qs))
            assert sorted([d["id"] for d in r]) == ["a", "c", "e"]
            assert all(d["done"] for d in r)

        # "done:false" should parse into a plain Term whose text is the
        # bool False, encoded by the field as the byte string b"f".
        q = parser.parse("done:false")
        assert q.__class__ == query.Term
        assert q.text is False
        assert schema["done"].to_bytes(False) == b("f")
        r = s.search(q)
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)

        r = s.search(parser.parse("done:no"))
        assert sorted([d["id"] for d in r]) == ["b", "d"]
        assert not any(d["done"] for d in r)
Ejemplo n.º 6
0
    def create_index(self):
        """Create the on-disk Whoosh index for tweets and cache it on self.

        Ensures the index directory exists, builds the tweet schema, and
        creates a fresh index named "TWTTR" inside it.
        """
        index_dir = "twitter_index"
        if not os.path.exists(index_dir):
            os.mkdir(index_dir)

        # All fields are stored so full tweet records can be rebuilt
        # directly from search hits.
        tweet_fields = dict(
            tweet_id=fields.TEXT(stored=True),
            batch=fields.NUMERIC(stored=True),
            content=fields.TEXT(stored=True),
            posted=fields.DATETIME(stored=True),
            owner_sn=fields.TEXT(stored=True),
            owner_id=fields.TEXT(stored=True),
            owner_name=fields.TEXT(stored=True),
            isRT=fields.BOOLEAN(stored=True),
            timesRT=fields.NUMERIC(stored=True),
            timesFav=fields.NUMERIC(stored=True),
            orig_timesRT=fields.NUMERIC(stored=True),
            orig_timesFav=fields.NUMERIC(stored=True),
            hashtags=fields.KEYWORD(stored=True),
            orgnlTweet=fields.TEXT(stored=True),
            mentions=fields.KEYWORD(stored=True),
            media=fields.TEXT(stored=True),
            url=fields.TEXT(stored=True),
            liwc=fields.TEXT(stored=True),
        )
        schema = fields.Schema(**tweet_fields)

        self.INDEX = index.create_in(index_dir, schema, indexname="TWTTR")
        print("New searching index succesfully created")

        return self.INDEX
class TweetSchema(fields.SchemaClass):
    """Whoosh index schema for tweet documents.

    Every field is stored so complete tweet records can be reconstructed
    from search hits.
    """

    # Tweet id and canonical URL; both unique per document.
    id = fields.ID(stored=True, unique=True)
    url = fields.ID(stored=True, unique=True)

    # Tweet body and the client application it was posted from.
    text = fields.TEXT(stored=True)
    source = fields.TEXT(stored=True)

    # Reply metadata: flag plus the id/name of the tweet replied to.
    reply = fields.BOOLEAN(stored=True)
    in_reply_to_id = fields.TEXT(stored=True)
    in_reply_to_name = fields.TEXT(stored=True)

    # Entity lists extracted from the tweet.
    user_mentions = fields.KEYWORD(stored=True)
    hashtags = fields.KEYWORD(stored=True)
    urls = fields.KEYWORD(stored=True)

    # Geolocation: flag plus coordinates (when available).
    geo = fields.BOOLEAN(stored=True)
    latitude = fields.NUMERIC(stored=True)
    longitude = fields.NUMERIC(stored=True)

    date = fields.DATETIME(stored=True)
Ejemplo n.º 8
0
def test_boolean3():
    """Search a BOOLEAN field directly with a pre-encoded Term query."""
    schema = fields.Schema(t=fields.TEXT(stored=True, field_boost=5),
                           b=fields.BOOLEAN(stored=True),
                           c=fields.TEXT)
    ix = RamStorage().create_index(schema)

    with ix.writer() as w:
        w.add_document(t=u("with hardcopy"), b=True, c=u("alfa"))
        w.add_document(t=u("no hardcopy"), b=False, c=u("bravo"))

    with ix.searcher() as s:
        # Encode the bool exactly as the field stores it on disk.
        true_bytes = schema["b"].to_bytes(True)
        hits = s.search(query.Term("b", true_bytes))
        assert [hit["t"] for hit in hits] == ["with hardcopy"]
Ejemplo n.º 9
0
def test_boolean_find_deleted():
    """Deleted documents must not show up in BOOLEAN term, Every ("*"),
    or NOT query results.

    Indexes a pattern of True/False docs across several segments, deletes
    every True doc, then verifies no query variant leaks a deleted doc.
    """
    # "Random" string of ones and zeros representing deleted and undeleted
    domain = "1110001010001110010101000101001011101010001011111101000101010101"

    schema = fields.Schema(i=fields.STORED, b=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    count = 0
    # Create multiple segments just in case
    for _ in xrange(5):
        w = ix.writer()
        for c in domain:
            w.add_document(i=count, b=(c == "1"))
            # BUG FIX: the counter was never advanced, so every document
            # was stored with i=0; give each document a unique number.
            count += 1
        w.commit(merge=False)

    # Delete documents where "b" is True
    with ix.writer() as w:
        w.delete_by_term("b", "t")

    with ix.searcher() as s:
        # Double check that documents with b=True are all deleted
        reader = s.reader()
        for docnum in xrange(s.doc_count_all()):
            b = s.stored_fields(docnum)["b"]
            assert b == reader.is_deleted(docnum)

        # Try doing a search for documents where b=True
        qp = qparser.QueryParser("b", ix.schema)
        q = qp.parse("b:t")
        r = s.search(q, limit=None)
        assert len(r) == 0

        # Make sure Every query doesn't match deleted docs
        r = s.search(qp.parse("*"), limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        r = s.search(qp.parse("*:*"), limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        # Make sure Not query doesn't match deleted docs
        q = qp.parse("NOT b:t")
        r = s.search(q, limit=None)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)

        r = s.search(q, limit=5)
        assert not any(hit["b"] for hit in r)
        assert not any(reader.is_deleted(hit.docnum) for hit in r)
Ejemplo n.º 10
0
    def test_query_schema_is_setup_correctly(self):
        """After a scan, the query parser's schema exposes the expected
        (name, field) pairs."""
        # Given
        p = Project(name='test', path=self.root)

        # When
        p.scan()

        # Then
        from whoosh import fields
        items = p._query_parser.schema.items()
        expected = [
            ('path', fields.TEXT()),
            ('ctime', fields.DATETIME()),
            ('completed', fields.BOOLEAN()),
            ('size', INT),
        ]
        for pair in expected:
            self.assertIn(pair, items)
Ejemplo n.º 11
0
def crear_esquema():
    """Build and return the Whoosh schema used to index liquor products."""
    campos = {
        "id": fields.NUMERIC(stored=True),
        "titulo": fields.TEXT(sortable=True, field_boost=1.5),
        "descripcion": fields.TEXT,
        "categoria": fields.TEXT(sortable=True),
        # Price kept as an exact Decimal with two places for sorting.
        "precio": fields.NUMERIC(Decimal, decimal_places=2, sortable=True),
        "precioGroup": fields.NUMERIC(sortable=True),
        "origen": fields.TEXT(sortable=True),
        "graduacion": fields.NUMERIC(sortable=True),
        "enStock": fields.BOOLEAN(stored=True),
        "urlProducto": fields.TEXT(field_boost=0.5),
    }
    return fields.Schema(**campos)
Ejemplo n.º 12
0
def test_boolean_multifield():
    """MultifieldParser query across a TEXT field and a BOOLEAN field."""
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           bit=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)

    cars = [("audi", True), ("vw", False), ("porsche", False),
            ("ferrari", True), ("citroen", False)]
    with ix.writer() as w:
        for car, flag in cars:
            w.add_document(name=u(car), bit=flag)

    with ix.searcher() as s:
        parser = qparser.MultifieldParser(["name", "bit"], schema)
        results = s.search(parser.parse(u("boop")))

        # Only the two bit=True documents come back for this term.
        assert len(results) == 2
        assert sorted(hit["name"] for hit in results) == ["audi", "ferrari"]
Ejemplo n.º 13
0
def test_boolean2():
    """Query a BOOLEAN field with the single-letter terms 't' and 'f'."""
    schema = fields.Schema(t=fields.TEXT(stored=True),
                           b=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    docs = [("some kind of text", False),
            ("some other kind of text", False),
            ("some more text", False),
            ("some again", True)]
    for text, flag in docs:
        writer.add_document(t=u(text), b=flag)
    writer.commit()

    with ix.searcher() as s:
        q_false = qparser.QueryParser('b', None).parse(u('f'))
        q_true = qparser.QueryParser('b', None).parse(u('t'))

        assert len(s.search(q_false)) == 3
        assert [d["b"] for d in s.search(q_true)] == [True]
        assert [d["b"] for d in s.search(q_false)] == [False] * 3
Ejemplo n.º 14
0
 def _init_schema():
     """Build the Whoosh schema describing a run record.

     Fixed fields cover run identity, status, timing and package/model
     metadata. Glob fields index dynamically-named attributes:
     ``scalar_*`` are float metrics; ``flagi_*`` / ``flagf_*`` /
     ``flagb_*`` / ``flags_*`` are int / float / bool / string flags;
     ``priv_*`` values are stored without being indexed.
     """
     schema = fields.Schema()
     schema.add("id", fields.ID(unique=True, stored=True))
     schema.add("short_id", fields.ID(stored=True))
     schema.add("status", fields.ID(stored=True))
     schema.add("started", fields.DATETIME(stored=True))
     schema.add("stopped", fields.DATETIME(stored=True))
     schema.add("pkg_type", fields.ID(stored=True))
     schema.add("pkg_name", fields.ID(stored=True))
     schema.add("pkg_version", fields.ID(stored=True))
     schema.add("model_name", fields.ID(stored=True))
     schema.add("op_name", fields.ID(stored=True))
     schema.add("label", fields.TEXT(stored=True))
     schema.add("scalar_*", fields.NUMERIC(float, stored=True), glob=True)
     schema.add("flagi_*", fields.NUMERIC(int, stored=True), glob=True)
     # BUG FIX: float flags (flagf_*) were declared NUMERIC(int), which
     # truncates fractional flag values; index them as floats, matching
     # the int/bool/string pattern of the sibling flag fields.
     schema.add("flagf_*", fields.NUMERIC(float, stored=True), glob=True)
     schema.add("flagb_*", fields.BOOLEAN(stored=True), glob=True)
     schema.add("flags_*", fields.ID(stored=True), glob=True)
     schema.add("priv_*", fields.STORED, glob=True)
     return schema
Ejemplo n.º 15
0
from whoosh import fields
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import FuzzyTermPlugin, GtLtPlugin, MultifieldParser, PhrasePlugin
from whoosh.writing import AsyncWriter

from . import ConfigNotFound, EncryptedConfigStore, KEY_FIELD_NAME
from .serializers import Serializer

from jumpscale.sals.fs import join_paths, mkdirs

# a map betwen our indexable fields and whoosh fields
# for now we don't support nested fields like Lists or Objects
# they will only be stored but not indexed
FIELD_MAP = {
    "Boolean": fields.BOOLEAN(stored=True),
    "Bytes": fields.TEXT(stored=True),
    "Email": fields.TEXT(stored=True),
    "GUID": fields.TEXT(stored=True),
    "IPAddress": fields.TEXT(stored=True),
    "IPRange": fields.TEXT(stored=True),
    "Json": fields.TEXT(stored=True),
    "Path": fields.ID(stored=True),
    "String": fields.TEXT(stored=True),
    "Tel": fields.TEXT(stored=True),
    "URL": fields.TEXT(stored=True),
    "Integer": fields.NUMERIC(bits=64, stored=True, sortable=True),
    "Float": fields.NUMERIC(float, bits=64, stored=True, sortable=True),
    "Port": fields.NUMERIC(stored=True, sortable=True),
    "Date": fields.NUMERIC(stored=True, sortable=True),
    "DateTime": fields.NUMERIC(stored=True, sortable=True),
    "Time": fields.NUMERIC(stored=True, sortable=True),
Ejemplo n.º 16
0
class Objects(object):
    """Whoosh-backed full-text index over entity ("ent") records.

    Entities are loaded from JSON files in ``Globals.DATA_DIR`` and indexed
    under ``INDEX_DIR``. The schema and index are created lazily and cached
    on the class. Python 2 code (uses ``unicode``).
    """

    INDEX_DIR = os.path.join(Globals.BASE_DIR, 'objects')
    INDEX = None   # lazily-opened/created whoosh index; see get_index()
    SCHEMA = None  # lazily-built whoosh schema; see get_schema()

    # Maps attribute content types (from Definitions) to whoosh field types.
    TYPES = {
        u'escidoc-objid':
        fields.TEXT,
        u'old':
        fields.TEXT,
        u'text':
        fields.TEXT(analyzer=analysis.FancyAnalyzer(), stored=True,
                    chars=True),
        u'num':
        fields.NUMERIC(stored=True),
        u'boolean':
        fields.BOOLEAN(stored=True),
        u'bool':
        fields.BOOLEAN(stored=True),
        # NOTE(review): dates are indexed as TEXT here, not DATETIME —
        # presumably to tolerate free-form date strings; confirm upstream.
        u'date':
        fields.TEXT(stored=True),
        u'arabic':
        fields.TEXT(analyzer=analysis.FancyAnalyzer(), stored=True,
                    chars=True),
        u'geoname-id':
        fields.TEXT
    }

    @classmethod
    def schema_fields(cls):
        """Derive the schema field dict from the attribute definitions.

        Each (attribute name, content type) pair from Definitions is mapped
        through TYPES, then the fixed ov/nov/oc/id fields are added on top.
        """
        sfields = set(
            (att['ov'], att['content_type']) for att in Definitions.all_atts())
        sfields = {k: cls.TYPES[v] for k, v in sfields}
        sfields.update(ov=fields.TEXT(analyzer=analysis.FancyAnalyzer(),
                                      stored=True,
                                      chars=True),
                       nov=fields.TEXT(analyzer=analysis.FancyAnalyzer(),
                                       stored=True,
                                       chars=True),
                       oc=fields.ID,
                       id=fields.ID(stored=True, unique=True))
        return sfields

    @classmethod
    def get_schema(cls):
        """Return the shared Schema, building it on first use."""
        if cls.SCHEMA is None:
            cls.SCHEMA = fields.Schema(**cls.schema_fields())
        return cls.SCHEMA

    @classmethod
    def get_index(cls):
        """Open the existing index, or create it and bulk-load all entities
        from the JSON files in Globals.DATA_DIR."""
        if cls.INDEX is None:
            if not os.path.exists(cls.INDEX_DIR):
                os.mkdir(cls.INDEX_DIR)
            if index.exists_in(cls.INDEX_DIR):
                cls.INDEX = index.open_dir(cls.INDEX_DIR)
            else:
                cls.INDEX = index.create_in(cls.INDEX_DIR, cls.get_schema())
                for json_file in filter(lambda s: ".json" in s,
                                        os.listdir(Globals.DATA_DIR)):
                    fp = open(os.path.join(Globals.DATA_DIR, json_file), 'r')
                    resp = json.load(fp)
                    fp.close()
                    for ent in resp['ents']:
                        cls.add_ent(ent)
        return cls.INDEX

    @classmethod
    def add_ent(cls, ent):
        """Index a single entity dict (fixed fields plus its 'atts').

        Commits per document; bulk loads pay one commit per entity.
        """
        writer = cls.get_index().writer()
        kwargs = {
            'ov': unicode(ent.get('ov', '')),
            'nov': unicode(ent.get('nov', '')),
            'oc': unicode(ent.get('oc', '')),
            'id': unicode(ent.get('id', ''))
        }
        if 'atts' in ent:
            # Attribute names become (lowercased) schema field names.
            kwargs.update({
                att['name'].lower(): unicode(att.get('ov', ''))
                for att in ent['atts']
            })
        writer.add_document(**kwargs)
        writer.commit()

    @classmethod
    def delete(cls, iden):
        """Remove the document whose 'id' field equals *iden*."""
        writer = cls.get_index().writer()
        writer.delete_by_term('id', iden)
        writer.commit()

    @classmethod
    def sync_db(cls):
        """Re-fetch every definition's entities and upsert them into the
        index (delete-then-add for entities already present)."""
        for d in Definitions.get_defs():
            resp = JsonInterface.method('get_ents',
                                        oc=d['ov'],
                                        include_content='true')
            with cls.get_index().searcher() as s:
                for ent in resp['ents']:
                    stored_fields = s.document(id=unicode(ent['id']))
                    if stored_fields:
                        cls.delete(unicode(ent['id']))
                    cls.add_ent(ent)

    @classmethod
    def search(cls, filters, **kwargs):
        """AND together one query per filter and run the search.

        Q_APPROX filters become parsed multifield queries; Q_EXACT filters
        scan stored documents for exact value matches and become an Or of
        id Terms. Results are grouped by 'oc' with no hit limit.
        """
        schema = cls.get_schema()

        def parse(filt):
            # NOTE(review): a filter with any other query_type makes parse()
            # return None, which query.And would choke on — confirm callers
            # only pass Q_APPROX/Q_EXACT filters.
            if filt.query_type == Filter.Q_APPROX:
                mp = qparser.MultifieldParser(filt.get_fields(), schema=schema)
                return mp.parse(unicode(filt.query_string))
            elif filt.query_type == Filter.Q_EXACT:
                s = cls.get_index().searcher()
                qs = filt.query_string
                f = lambda d: qs in [
                    d.get(field) for field in filt.get_fields()
                ]
                ids = [unicode(d['id']) for d in filter(f, s.documents())]
                return query.Or([query.Term('id', iden) for iden in ids])

        queries = [parse(filt) for filt in filters]
        q = query.And(queries)
        kwargs.update(limit=None, groupedby='oc')
        results = cls.get_index().searcher().search(q, **kwargs)
        return Results(results)
Ejemplo n.º 17
0
class ISPWhoosh(object):
    """
    Helper class to index the ISP model with Whoosh to allow full-text search
    """
    # Index schema: 'id' mirrors the database primary key; only it is stored.
    schema = fields.Schema(
        id=fields.ID(unique=True, stored=True),
        is_ffdn_member=fields.BOOLEAN(),
        is_disabled=fields.BOOLEAN(),
        name=fields.TEXT(),
        shortname=fields.TEXT(),
        description=fields.TEXT(),
        covered_areas=fields.KEYWORD(scorable=True, commas=True, lowercase=True),
        step=fields.NUMERIC(signed=False),
    )

    # NOTE(review): this is the FieldType object, not the field NAME, yet it
    # is later passed as the fieldname to delete_by_term() — verify deletion
    # actually works; 'id' (the string) looks like the intended value.
    primary_key=schema._fields['id']

    @staticmethod
    def get_index_dir():
        """Return the index directory from app config (default 'whoosh')."""
        return current_app.config.get('WHOOSH_INDEX_DIR', 'whoosh')

    @classmethod
    def get_index(cls):
        """Open the on-disk index, creating directory and index if needed."""
        idxdir = cls.get_index_dir()
        if index.exists_in(idxdir):
            idx = index.open_dir(idxdir)
        else:
            if not os.path.exists(idxdir):
                os.makedirs(idxdir)
            idx = index.create_in(idxdir, cls.schema)
        return idx

    @classmethod
    def _search(cls, s, terms):
        """Run *terms* against the text fields; the mask query hides
        documents where is_disabled is True."""
        return s.search(qparser.MultifieldParser([
            'name', 'shortname', 'description', 'covered_areas'
        ], schema=cls.schema).parse(terms),
           mask=whoosh.query.Term('is_disabled', True))

    @classmethod
    def search(cls, terms):
        """Full-text search returning ISP model rows in relevance order.

        Whoosh supplies the ranking; the database supplies the rows, which
        are then re-sorted to match Whoosh's rank.
        """
        with ISPWhoosh.get_index().searcher() as s:
            sres = cls._search(s, terms)
            ranks = {}
            for rank, r in enumerate(sres):
                ranks[r['id']] = rank

            if not len(ranks):
                return []

            _res = ISP.query.filter(ISP.id.in_(ranks.keys()))

        return sorted(_res, key=lambda r: ranks[r.id])

    @classmethod
    def update_document(cls, writer, model):
        """Upsert one ISP model into the index.

        '_stored_id' keeps the raw integer id retrievable while the indexed
        'id' term is its unicode form.
        """
        kw = {
            'id': unicode(model.id),
            '_stored_id': model.id,
            'is_ffdn_member': model.is_ffdn_member,
            'is_disabled': model.is_disabled,
            'name': model.name,
            'shortname': model.shortname,
            'description': model.json.get('description'),
            'covered_areas': ','.join(model.covered_areas_names()),
            'step': model.json.get('progressStatus')
        }
        writer.update_document(**kw)

    @classmethod
    def _after_flush(cls, app, changes):
        """SQLAlchemy after-flush hook: mirror ISP inserts/updates/deletes
        into the Whoosh index."""
        isp_changes = []
        for change in changes:
            if change[0].__class__ == ISP:
                update = change[1] in ('update', 'insert')
                isp_changes.append((update, change[0]))

        # NOTE(review): this guards on the full change list, not isp_changes;
        # non-ISP-only flushes still open a writer — likely meant isp_changes.
        if not len(changes):
            return

        idx = cls.get_index()
        with idx.writer() as writer:
            for update, model in isp_changes:
                if update:
                    cls.update_document(writer, model)
                else:
                    writer.delete_by_term(cls.primary_key, model.id)