Example #1
def test_searching():
    with make_index().searcher() as s:

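        # Helper: run the query and compare the ids of the hits against the expected list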
        def _runq(q, result, **kwargs):
            r = s.search(q, **kwargs)
            assert_equal([d["id"] for d in r], result)

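        # Exercise Term, Prefix, Wildcard, TermRange, NumericRange, Phrase and Every queries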
        _runq(query.Term("text", u("format")), ["format", "vector"])
        _runq(query.Term("text", u("the")),
              ["fieldtype", "format", "const", "vector", "stored"])
        _runq(query.Prefix("text", u("st")), ["format", "vector", "stored"])
        _runq(query.Wildcard("id", u("*st*")), ["stored", "const"])
        _runq(query.TermRange("id", u("c"), u("s")),
              ["fieldtype", "format", "const"])
        _runq(query.NumericRange("subs", 10, 100),
              ["fieldtype", "format", "vector", "scorable"])
        _runq(query.Phrase("text", ["this", "field"]),
              ["scorable", "unique", "stored"],
              limit=None)
        _runq(query.Every(), [
            "fieldtype", "format", "vector", "scorable", "stored", "unique",
            "const"
        ])
        _runq(query.Every("subs"), [
            "fieldtype", "format", "vector", "scorable", "stored", "unique",
            "const"
        ])
Example #2
def test_bigsort():
    times = 30000
    dirname = "testindex"
    
    df = fields.DATETIME(stored=True)
    schema = fields.Schema(id=fields.ID(stored=True), date=df)
    
    if os.path.exists(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)
    ix = index.create_in(dirname, schema)
    
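    # Add `times` documents with random dates and time how long the write takes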
    print("Writing...")
    t = now()
    w = ix.writer(limitmb=512)
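    # NOTE: xrange and text_type are assumed to come from whoosh.compat (Python 2/3 shims)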
    for i in xrange(times):
        dt = datetime.fromtimestamp(random.randint(15839593, 1294102139))
        w.add_document(id=text_type(i), date=dt)
    w.commit()
    print("Writing took ", now() - t)
    
    ix = index.open_dir(dirname)
    s = ix.searcher()
    q = query.Wildcard("id", "1?2*")
    
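    # Time enumerating every sortable value of the "date" field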
    t = now()
    x = list(df.sortable_values(s.reader(), "date"))
    print(now() - t, len(x))
    
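    # Time reading the matching document ids from the postings of each date value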
    t = now()
    for y in x:
        p = list(s.postings("date", y).all_ids())
    print(now() - t)
    
    
    
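    # Time a search sorted by date, newest first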
    t = now()
    r = s.search(q, limit=25, sortedby="date", reverse=True)
    print("Search 1 took", now() - t)
    print("len=", r.scored_length())
    
    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)
    
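    # Repeat the ascending sorted search (presumably to measure the effect of caching)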
    t = now()
    r = s.search(q, limit=25, sortedby="date")
    print("Search 2 took", now() - t)
    
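    # Brute-force comparison: fetch the stored dates and take the 25 largest with heapq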
    from heapq import nlargest
    t = now()
    sf = s.stored_fields
    gen = ((sf(n)["date"], n) for n in q.docs(s))
    r = nlargest(25, gen)
    print(now() - t)
Example #3
    def test_tamilprefix(self):
        # Read a Tamil prefix from the test fixture file and turn it into a wildcard query
        with open(
            '/home/nanditha/projects/tamilthedal/trunk/src/encyclopedia/utilities/pyunitwildtext'
        ) as f:
            cont = f.readline()
        text = cont.split(':')
        index = open_dir(settings.INDEX_PATH)
        wildtext = unicode(str(text[0]), 'utf-8') + u'*'
        q = query.Wildcard("content", wildtext)
        srch = index.searcher()
        res = srch.search(q)
        self.assertNotEqual(len(res), 0)
        print len(res), 'results'
Example #4
def test_patterns():
    domain = u("aaron able acre adage aether after ago ahi aim ajax akimbo "
               "alembic all amiga amount ampere").split()
    schema = fields.Schema(word=fields.KEYWORD(stored=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for word in domain:
            w.add_document(word=word)

    with ix.reader() as r:
        assert_equal(list(r.lexicon("word")), domain)

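        # Prefix, Wildcard and Regex queries simplify to an OR of the matching terms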
        assert_equal(list(r.expand_prefix("word", "al")), ["alembic", "all"])
        q = query.Prefix("word", "al")
        assert_equal(q.simplify(r).__unicode__(), "(word:alembic OR word:all)")

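        # _find_prefix returns the literal prefix used to narrow the term scan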
        q = query.Wildcard("word", "a*[ae]")
        assert_equal(q.simplify(r).__unicode__(),
                     "(word:able OR word:acre OR word:adage OR " +
                     "word:amiga OR word:ampere)")
        assert_equal(q._find_prefix(q.text), "a")

        q = query.Regex("word", "am.*[ae]")
        assert_equal(q.simplify(r).__unicode__(),
                     "(word:amiga OR word:ampere)")
        assert_equal(q._find_prefix(q.text), "am")

        q = query.Regex("word", "able|ago")
        assert_equal(q.simplify(r).__unicode__(), "(word:able OR word:ago)")
        assert_equal(q._find_prefix(q.text), "")

        # special case: ? may mean "zero occurrences"
        q = query.Regex("word", "ah?i")
        assert_equal(q.simplify(r).__unicode__(), "(word:ahi OR word:aim)")
        assert_equal(q._find_prefix(q.text), "a")

        # special case: * may mean "zero occurrences"
        q = query.Regex("word", "ah*i")
        assert_equal(q.simplify(r).__unicode__(), "(word:ahi OR word:aim)")
        assert_equal(q._find_prefix(q.text), "a")
Example #5
    def make_wildcard(self, fieldname, text):
        # Fall back to the parser's default field when no fieldname is given
        fieldname = fieldname or self.default_field
        return query.Wildcard(fieldname, text)
Example #6
def test_wildcard():
    _run_query(query.Or([query.Wildcard('value', u('*red*')),
                         query.Wildcard('name', u('*yellow*'))]),
               [u("A"), u("C"), u("D"), u("E")])
    # A wildcard that matches no indexed terms should return no hits
    _run_query(query.Wildcard('value', u('glonk*')), [])
Example #7
def searchPapers_whoosh(year=None, author=None, topic=None, userQuery=None):

    # Open the existing index
    import whoosh.index as index
    import nltk
    nltk.download('wordnet')
    from nltk.stem.wordnet import WordNetLemmatizer
    lemma = WordNetLemmatizer()
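    # Lemmatize the query terms (nouns, then verbs) before searching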
    userQuery = " ".join(
        lemma.lemmatize(word, 'n') for word in userQuery.split())
    userQuery = " ".join(
        lemma.lemmatize(word, 'v') for word in userQuery.split())
    index_dir = "../index"

    ix = index.open_dir(index_dir)

    if topic == 'All the topics':
        topic = None
    if year == 'All the years':
        year = None
    # Parse with filter on fields
    from whoosh import query
    from whoosh import qparser
    from whoosh.qparser import QueryParser
    from whoosh.qparser import MultifieldParser

    with ix.searcher() as s:
        if (not userQuery):
            qp = QueryParser("id", schema=ix.schema)
            user_q = qp.parse("*")

        else:
            # OrGroup.factory(scale): the scale factor controls how much extra weight
            # documents matching more of the terms receive (0 = none, 1 = maximum)
            og = qparser.OrGroup.factory(0.8)

            # search both in title and text
            mparser = MultifieldParser(["title", "paper_text"],
                                       schema=ix.schema,
                                       group=og)
            user_q = mparser.parse(userQuery)

        # Filter results for fields
        allow_q = query.NullQuery
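        # Each optional filter below is AND-ed onto allow_q (NullQuery is the empty starting point)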

        if (year):
            allow_q = allow_q & query.Term("year", year)

        if (author):
            formattedAuthors = author.lower().split()
            for fa in formattedAuthors:
                fa = "*" + fa + "*"
                allow_q = allow_q & query.Wildcard("authors", fa)

        if (topic):
            topicParser = qparser.QueryParser("topic", ix.schema)
            allow_q = allow_q & topicParser.parse('"' + topic + '"')

        if (not year and not author and not topic):
            results = s.search(user_q, limit=50)
        else:
            results = s.search(user_q, filter=allow_q, limit=50)

        papers = []
        for result in results:
            papers.append(int(result['id']))
        return papers