Ejemplo n.º 1
0
def test_director_exception():
    """Check that exceptions raised inside director callbacks reach the caller.

    An ExpandDecider subclass and a MatchDecider subclass both raise a custom
    exception from ``__call__``.  The exception is expected both when the
    decider is called directly and when it is passed to
    ``Enquire.get_eset()`` / ``Enquire.get_mset()``.
    """
    class TestException(Exception):
        # The message is the concatenation of the two constructor arguments.
        def __init__(self, a, b):
            super(TestException, self).__init__(a + b)

    class EDecider(xapian.ExpandDecider):
        def __call__(self, term):
            raise TestException("foo", "bar")

    class MDecider(xapian.MatchDecider):
        def __call__(self, doc):
            raise TestException("foo", "bar")

    database = setup_database()
    it_query = xapian.Query('it')
    enquire = xapian.Enquire(database)
    enquire.set_query(it_query)

    relevant = xapian.RSet()
    relevant.add_document(1)

    # Expand decider: called directly, then through get_eset().
    expand_decider = EDecider()
    expect_exception(TestException, "foobar", expand_decider, "foo")
    expect_exception(TestException, "foobar", enquire.get_eset, 10, relevant,
                     expand_decider)

    # Match decider: called directly, then through get_mset().
    match_decider = MDecider()
    expect_exception(TestException, "foobar", match_decider,
                     xapian.Document())
    expect_exception(TestException, "foobar", enquire.get_mset, 0, 10, None,
                     match_decider)
Ejemplo n.º 2
0
def test_eset_iter():
    """Exercise iteration over ESets obtained with and without a query set.

    The same RSet (document 3) is expanded twice: once on an Enquire with no
    query, once on an Enquire whose query is ``was OR it``.  The test checks
    the number of items in each ESet and that the two items of the
    with-query ESet match items 0 and 2 of the no-query ESet in both term
    and weight.
    """
    database = setup_database()
    or_query = xapian.Query(xapian.Query.OP_OR, "was", "it")
    relevant = xapian.RSet()
    relevant.add_document(3)

    context("getting eset items without a query")
    plain_enquire = xapian.Enquire(database)
    plain_eset = plain_enquire.get_eset(10, relevant)
    plain_items = list(plain_eset)
    expect(len(plain_items), 3)
    expect(len(plain_items), len(plain_eset))

    context("getting eset items with a query")
    query_enquire = xapian.Enquire(database)
    query_enquire.set_query(or_query)
    query_eset = query_enquire.get_eset(10, relevant)
    query_items = list(query_eset)
    expect(len(query_items), 2)
    expect(len(query_items), len(query_eset))

    # (index in the with-query ESet, index in the no-query ESet)
    matching_positions = ((0, 0), (1, 2))

    context("comparing eset items with a query to those without")
    for with_idx, without_idx in matching_positions:
        expect(query_items[with_idx].term, plain_items[without_idx].term)

    context("comparing eset weights with a query to those without")
    for with_idx, without_idx in matching_positions:
        expect(query_items[with_idx].weight, plain_items[without_idx].weight)
Ejemplo n.º 3
0
    def suggest(self, search, offset=0, limit=0, moffset=0, mlimit=0,
                klimit=1.0, kmlimit=1.0, prefix=None, decider=None,
                score=False, format_term=True, collapse_stems=True,
                include_query_terms=True, order=None, reverse=False):
        """
        Yield ``(term, weight)`` pairs that may expand the given search.

        The matches of ``search.query`` (window ``moffset``/``mlimit``) are
        used as the relevance set from which the expansion terms are taken.

        A ``mlimit`` or ``limit`` of 0 is replaced by the document count
        scaled by ``kmlimit`` or ``klimit`` respectively.  A non-None
        ``prefix`` takes precedence over ``decider``; with neither given a
        ``LanguageDecider`` is used.

        NOTE(review): ``offset``, ``score``, ``format_term`` and
        ``collapse_stems`` are accepted but not read in this method.

        This is a generator: nothing runs until the first item is requested.
        """
        self.backend.reopen()
        if mlimit == 0:
            mlimit = int(self.backend.get_doccount() * kmlimit)

        enquire = xapian.Enquire(self.backend)
        enquire.set_query(search.query)
        matches = self._build_mset(
            enquire, offset=moffset, limit=mlimit, order=order,
            reverse=reverse)

        # Every matched document becomes part of the relevance set.
        relevant = xapian.RSet()
        for hit in matches:
            relevant.add_document(hit.docid)

        term_filter = decider if prefix is None else PrefixDecider(prefix)
        if term_filter is None:
            term_filter = LanguageDecider()

        if limit == 0:
            limit = int(self.backend.get_doccount() * klimit)

        flags = enquire.INCLUDE_QUERY_TERMS if include_query_terms else 0
        expansion = enquire.get_eset(limit, relevant, flags, 1.0, term_filter,
                                     -3)

        for entry in expansion.items:
            yield (entry[0].decode('utf8'), entry[1])
Ejemplo n.º 4
0
    def get_suggestions(self, count=10, filter=None):
        """
        Compute suggestions for additional search terms.

        The first 30 matches of ``self.enquire`` form the relevance set.
        When ``filter`` is None a ``self.BasicFilter()`` instance is used.

        Return a Xapian ESet with at most ``count`` terms requested.
        """
        relevant = xapian.RSet()
        for hit in self.enquire.get_mset(0, 30):
            relevant.add_document(hit.docid)

        expand_filter = self.BasicFilter() if filter is None else filter
        return self.enquire.get_eset(count, relevant, expand_filter)
Ejemplo n.º 5
0
    def xapian_search(self, k=100, showscore=True):
        """
        Run ``self.queryString`` against the Xapian index at
        ``self.database.xapian`` and record the outcome on the instance.

        At most ``k`` matches are fetched (capped at the document count).
        Sets ``self.ranked_ids`` (document data of each match, in match
        order), ``self.match_terms`` (docid -> list of matching terms) and
        ``self.alternatives`` (built from an expansion set of up to 100
        terms).

        Returns ``(ids, scores)`` when ``showscore`` is true, otherwise None.

        NOTE(review): ``self.alternatives`` is keyed by the second element of
        each expansion item and maps to the first, so items sharing the same
        second element overwrite each other -- confirm this is intended.
        """
        print(self.database.xapian)
        index_path = self.database.xapian
        index = xapian.Database(index_path)

        # AND-combined, English-stemmed query parser using the instance stopper.
        parser = xapian.QueryParser()
        parser.set_stemmer(xapian.Stem("en"))
        parser.set_stopper(self.stopper)
        parser.set_database(index)
        parser.set_default_op(xapian.Query.OP_AND)
        parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
        parsed_query = parser.parse_query(self.queryString)

        first = 0
        max_hits = min(k, index.get_doccount())
        print(max_hits)
        enquire = xapian.Enquire(index)
        enquire.set_query(parsed_query)
        hits = enquire.get_mset(first, max_hits)
        relevant = xapian.RSet()

        ranked_ids, hit_scores, terms_by_docid = [], [], {}
        for hit in hits:
            relevant.add_document(hit.docid)

            hit_document = hit.document
            hit_scores.append(hit.weight)

            ranked_ids.append(hit_document.get_data())
            terms_by_docid[hit.docid] = list(enquire.matching_terms(hit))

        self.ranked_ids = ranked_ids
        self.match_terms = terms_by_docid

        expansion = enquire.get_eset(100, relevant, 0)
        self.alternatives = {}
        for entry in expansion.items:
            self.alternatives[entry[1]] = entry[0]

        if not showscore:
            return None
        return ranked_ids, hit_scores
Ejemplo n.º 6
0
 def eset_profile(self, items_repository, size, content_filter):
     """
     Build a profile of up to `size` terms from the packages in
     `self.pkg_profile`.

     The documents found by `data.axi_search_pkgs` form the relevance set;
     twice `size` expansion terms are requested (filtered by
     `content_filter`) and then reduced by `self._eliminate_duplicated`.
     """
     enquire = xapian.Enquire(items_repository)
     package_docs = data.axi_search_pkgs(items_repository, self.pkg_profile)

     # Relevance set made of the profile's package documents.
     relevant = xapian.RSet()
     for package_doc in package_docs:
         relevant.add_document(package_doc.docid)

     # Ask for more terms than needed; duplicates are dropped afterwards.
     expansion = enquire.get_eset(
         size * 2, relevant, xapian.Enquire.INCLUDE_QUERY_TERMS, 1,
         content_filter)
     candidate_terms = [entry.term for entry in expansion]
     return self._eliminate_duplicated(candidate_terms, size)
 def test_eset(self):
     """Check that a query built from an ESet matches more than the original.

     The matches for "foo" seed a relevance set; the expansion terms taken
     from it are OR-ed into a new query, which must return strictly more
     results than the original query did.
     """
     self.enquire.set_query(xapian.Query("foo"))
     original_hits = self.enquire.get_mset(0, 100)

     # Relevance set: every document the original query matched.
     relevant = xapian.RSet()
     for hit in original_hits:
         relevant.add_document(hit.docid)

     expansion = self.enquire.get_eset(20, relevant)
     expanded_terms = [entry.term for entry in expansion]
     self.enquire.set_query(
         xapian.Query(xapian.Query.OP_OR, expanded_terms))

     expanded_hits = self.enquire.get_mset(0, 100)
     self.assertTrue(len(original_hits) < len(expanded_hits))
Ejemplo n.º 8
0
 def run(self, rec, user, recommendation_size):
     """
     Perform the recommendation strategy for `user`.

     The user's profile terms are stored as one document in a scratch
     database, `rec.users_repository` is added to it, and the expansion
     set seeded by that single document (filtered by a `PkgExpandDecider`
     built from `user.items()`) is turned into the result.
     """
     # NOTE(review): the scratch database lives at a fixed path and is
     # overwritten on every call; concurrent runs would share it -- confirm
     # this is acceptable.
     scratch_db = xapian.WritableDatabase("/tmp/Database",
                                          xapian.DB_CREATE_OR_OVERWRITE)

     user_profile = self.get_user_profile(user, rec)
     profile_doc = xapian.Document()
     for package in user_profile:
         profile_doc.add_term(package)
     profile_doc.add_term("TO_BE_DELETED")
     profile_docid = scratch_db.add_document(profile_doc)
     scratch_db.add_database(rec.users_repository)

     # The relevance set holds only the profile document added above.
     seed = xapian.RSet()
     seed.add_document(profile_docid)

     enquire = xapian.Enquire(scratch_db)
     enquire.set_weighting_scheme(rec.weight)
     expansion = enquire.get_eset(recommendation_size, seed,
                                  PkgExpandDecider(user.items()))
     return self.get_result_from_eset(expansion)
Ejemplo n.º 9
0
def test_all():
    """Smoke-test a broad cross-section of the xapian Python bindings.

    Sections, in order: version reporting, absence of ``xapian.cvar``, Stem
    and Document basics, Query construction and description, database
    factory functions, iteration over Query / MSet / ESet / Database /
    term, posting and position lists, MatchDecider and ExpandDecider,
    ``get_eset()`` minimum weight, QueryParser features, non-ASCII terms,
    stoppers, TermGenerator, value range processors, FieldProcessor,
    regression checks, metadata and OP_SCALE_WEIGHT.

    Later sections reuse objects created and mutated by earlier ones
    (``db``, ``qp``, ``enq``, ``stem``), so the statement order matters.
    """
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(), xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    # A regexp check would be better, but seems to create a bogus "leak" of -1
    # objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3,
           'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1',
           'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        res = xapian.cvar
        print("Unhandled constants: ", res)
        return res

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    #
    # Python 3.5 generates a different exception message here to earlier
    # versions, so we need a check which matches both.
    expect_exception(AttributeError,
                     lambda msg: msg.find("has no attribute 'cvar'") != -1,
                     access_cvar)

    stem = xapian.Stem(b"english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    doc = xapian.Document()
    doc.set_data(b"a\0b")
    if doc.get_data() == b"a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), b"a\0b",
           "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data(b"is there anybody out there?")
    doc.add_term(b"XYzzy")
    doc.add_posting(stem(b"is"), 1)
    doc.add_posting(stem(b"there"), 2)
    doc.add_posting(stem(b"anybody"), 3)
    doc.add_posting(stem(b"out"), 4)
    doc.add_posting(stem(b"there"), 5)

    # In-memory database holding the single document built above; it is
    # reused (and extended) by the sections below.
    db = xapian.WritableDatabase('', xapian.DB_BACKEND_INMEMORY)
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(
        xapian.Query(xapian.Query.OP_OR, [t.encode('utf-8') for t in terms]),
        "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE,
                          (b"smoke", b"test", b"tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query(b"smoke"), query1, b"string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(
        query2, "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(
        xapian.Query(xapian.Query.OP_OR, [s.encode('utf-8') for s in subqs]),
        "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, b'1', b'4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected (or not wrapped
    # in the first cases):

    expect_exception(
        AttributeError,
        lambda msg: msg.find("has no attribute 'open_stub'") != -1,
        lambda: xapian.open_stub(b"nosuchdir/nosuchdb"))
    expect_exception(
        AttributeError,
        lambda msg: msg.find("has no attribute 'open_stub'") != -1,
        lambda: xapian.open_stub(b"nosuchdir/nosuchdb", xapian.DB_OPEN))

    expect_exception(
        xapian.DatabaseOpeningError, None,
        lambda: xapian.Database(b"nosuchdir/nosuchdb", xapian.DB_BACKEND_STUB))
    expect_exception(
        xapian.DatabaseOpeningError, None, lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb", xapian.DB_OPEN | xapian.DB_BACKEND_STUB))

    expect_exception(
        xapian.DatabaseOpeningError, None, lambda: xapian.Database(
            b"nosuchdir/nosuchdb", xapian.DB_BACKEND_GLASS))
    expect_exception(
        xapian.DatabaseCreateError, None, lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb", xapian.DB_CREATE | xapian.DB_BACKEND_GLASS))

    expect_exception(
        xapian.FeatureUnavailableError, None, lambda: xapian.Database(
            b"nosuchdir/nosuchdb", xapian.DB_BACKEND_CHERT))
    expect_exception(
        xapian.FeatureUnavailableError, None, lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb", xapian.DB_CREATE | xapian.DB_BACKEND_CHERT))

    expect_exception(xapian.NetworkError, None, xapian.remote_open,
                     b"/bin/false", b"")
    expect_exception(xapian.NetworkError, None, xapian.remote_open_writable,
                     b"/bin/false", b"")

    expect_exception(xapian.NetworkError, None, xapian.remote_open,
                     b"127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None, xapian.remote_open_writable,
                     b"127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:

    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)
    enq.set_query(xapian.Query(xapian.Query.OP_OR, b"there", b"is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = b" ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, b"is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist(b"there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(b""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, b"there"):
        count += 1
    expect(count, 2,
           "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to(b'n')
    # Every item produced after skip_to(b'n') must sort at or after b'n'.
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < b'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" %
                           x.term.decode('utf-8'))

    # Feature test for Document.values
    count = 0
    for term in list(doc.values()):
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    # A second document is added to db here; it is the only one with value
    # slot 0 set to b"yes".
    doc = xapian.Document()
    doc.set_data(b"Two")
    doc.add_posting(stem(b"out"), 1)
    doc.add_posting(stem(b"outside"), 1)
    doc.add_posting(stem(b"source"), 2)
    doc.add_value(0, b"yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            return doc.get_value(0) == b"yes"

    query = xapian.Query(stem(b"out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            return (not term.startswith(b'a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith(b'a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True, "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>", qp.parse_query,
                     b"test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(
        qp.parse_query(b"NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    # This parser instance (AND default op, English stemmer) is reused by
    # the non-ASCII and stopper checks further down.
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem(b'en'))
    expect_query(qp.parse_query(b"foo o", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((SYNONYM WILDCARD OR o) OR Zo@2))")

    expect_query(qp.parse_query(b"foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((SYNONYM WILDCARD OR outside) OR Zoutsid@2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xa3')),
                 '(foo OR bar\\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xc2\xa3')),
                 '(foo OR bar\u00a3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, b'foo', b'bar'),
                 '(foo OR bar)')

    expect_query(
        qp.parse_query(b"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT Zt\u00e9st@1)")

    doc = xapian.Document()
    doc.set_data(b"Unicode with an acc\xe9nt")
    doc.add_posting(stem(b"out\xe9r"), 1)
    expect(doc.get_data(), b"Unicode with an acc\xe9nt")
    term = next(doc.termlist()).term
    expect(term, b"out\xe9r")

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    stop.add(b'a')
    expect(stop(b'a'), True)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == b"b"

        def get_description(self):
            return "my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), "my_b_stopper")
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    expect(stop(b'b'), True)
    expect_query(qp.parse_query(b"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text(b'foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()], [(b'bar', 1, [2]), (b'baz', 1, [3]),
                                          (b'foo', 2, [1, 4])])

    # Check DateValueRangeProcessor works
    context("checking that DateValueRangeProcessor works")
    qp = xapian.QueryParser()
    vrpdate = xapian.DateValueRangeProcessor(1, 1, 1960)
    qp.add_valuerangeprocessor(vrpdate)
    query = qp.parse_query(b'12/03/99..12/04/01')
    expect(str(query), 'Query(VALUE_RANGE 1 19991203 20011204)')

    # Regression test for bug#193, fixed in 1.0.3.
    context("running regression test for bug#193")
    vrp = xapian.NumberValueRangeProcessor(0, b'$', True)
    a = '$10'
    b = '20'
    # Note: `a` is passed as str while `b` is passed as bytes.
    slot, a, b = vrp(a, b.encode('utf-8'))
    expect(slot, 0)
    expect(xapian.sortable_unserialise(a), 10)
    expect(xapian.sortable_unserialise(b), 20)

    # Feature test for xapian.FieldProcessor
    context("running feature test for xapian.FieldProcessor")

    class testfieldprocessor(xapian.FieldProcessor):
        def __call__(self, s):
            if s == 'spam':
                raise Exception('already spam')
            return xapian.Query("spam")

    qp.add_prefix('spam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam', testfieldprocessor())
    query = qp.parse_query('spam:ignored')
    expect(str(query), 'Query(spam)')

    # FIXME: This doesn't currently work:
    # expect_exception(Exception, 'already spam', qp.parse_query, 'spam:spam')

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)

    # Regression test - fixed in 0.9.10.1.
    # `oquery` is not used afterwards.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query(b"I like tea")

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata(b'Foo'), b'')
    db.set_metadata(b'Foo', b'Foo')
    expect(db.get_metadata(b'Foo'), b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, b'')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.set_metadata, b'',
                     b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, b'')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(
        xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query(b'foo'), 5),
        "5 * foo")
Ejemplo n.º 10
0
    def more_like_this(self,
                       model_instance,
                       additional_query=None,
                       start_offset=0,
                       end_offset=None,
                       limit_to_registered_models=True,
                       result_class=None,
                       **kwargs):
        """
        Given a model instance, returns a result set of similar documents.

        Required arguments:
            `model_instance` -- The model instance to use as a basis for
                                retrieving similar documents.

        Optional arguments:
            `additional_query` -- An additional query to narrow results
            `start_offset` -- The starting offset (default=0)
            `end_offset` -- The ending offset (default=None), if None, then all documents
            `limit_to_registered_models` -- Limit returned results to models registered in the current `SearchSite` (default = True)
            `result_class` -- Class used to build each result
                              (default=None, meaning `SearchResult`)
            `**kwargs` -- Accepted but not read by this method

        Returns:
            A dictionary with the following keys:
                `results` -- A list of `result_class` instances
                `hits` -- The total available results
                `facets` -- Always empty `fields`, `dates` and `queries` dicts
                `spelling_suggestion` -- Always None

            When `model_instance` has no document in the index, the same
            dictionary is returned with no results and zero hits.

        Opens a database connection, then builds a simple query using the
        `model_instance` to build the unique identifier.

        For each document retrieved(should always be one), adds an entry into
        an RSet (relevance set) with the document id, then, uses the RSet
        to query for an ESet (A set of terms that can be used to suggest
        expansions to the original query), omitting any document that was in
        the original query.

        Finally, processes the resulting matches and returns.
        """
        database = self._database()

        if result_class is None:
            result_class = SearchResult

        # Term that uniquely identifies the instance's own document.
        identifier_term = (DOCUMENT_ID_TERM_PREFIX +
                           get_identifier(model_instance))
        query = xapian.Query(identifier_term)

        enquire = xapian.Enquire(database)
        enquire.set_query(query)

        rset = xapian.RSet()

        if not end_offset:
            end_offset = database.get_doccount()

        # `match` stays None when the identifier query matches nothing;
        # otherwise it ends up as the last matched document.
        match = None
        for match in self._get_enquire_mset(database, enquire, 0, end_offset):
            rset.add_document(match.docid)

        if match is None:
            # The instance is not indexed, so there is nothing to expand
            # from.  Previously this fell through to an UnboundLocalError.
            return {
                'results': [],
                'hits': 0,
                'facets': {
                    'fields': {},
                    'dates': {},
                    'queries': {},
                },
                'spelling_suggestion': None,
            }

        # The matched document's term count is used both as the number of
        # expansion terms requested and as the ELITE_SET parameter.
        term_count = match.document.termlist_count()
        expand_terms = [
            expand.term
            for expand in enquire.get_eset(term_count, rset, XHExpandDecider())
        ]
        query = xapian.Query(xapian.Query.OP_ELITE_SET, expand_terms,
                             term_count)
        # Exclude the instance's own document from the similar results.
        query = xapian.Query(xapian.Query.OP_AND_NOT,
                             [query, identifier_term])
        if limit_to_registered_models:
            registered_models = self.build_models_list()

            if len(registered_models) > 0:
                query = xapian.Query(
                    xapian.Query.OP_AND, query,
                    xapian.Query(xapian.Query.OP_OR, [
                        xapian.Query('%s%s' % (DOCUMENT_CT_TERM_PREFIX, model))
                        for model in registered_models
                    ]))
        if additional_query:
            query = xapian.Query(xapian.Query.OP_AND, query, additional_query)

        enquire.set_query(query)

        results = []
        matches = self._get_enquire_mset(database, enquire, start_offset,
                                         end_offset)

        for match in matches:
            # NOTE(review): pickle.loads trusts whatever is stored in the
            # index; this is only safe while the index is written solely by
            # this application.
            app_label, module_name, pk, model_data = pickle.loads(
                self._get_document_data(database, match.document))
            results.append(
                result_class(app_label, module_name, pk, match.percent,
                             **model_data))

        return {
            'results': results,
            'hits': self._get_hit_count(database, enquire),
            'facets': {
                'fields': {},
                'dates': {},
                'queries': {},
            },
            'spelling_suggestion': None,
        }
Ejemplo n.º 11
0
    def run(self, search_options, progressbar=None):
        """
        Run the search described by `search_options` and collect the matching
        documents and the scaled scores of the expansion terms.

        Arguments:
            `search_options` -- dict; the keys read here are 'enquire',
                                'n_mset', 'n_eset', 'eset_showqueryterms',
                                'selected_language' and 'eset_white_list'.
            `progressbar` -- optional object offering `set_text` and
                             `set_fraction`.  When None, progress reporting
                             (and the GTK event pumping that goes with it)
                             is skipped.

        Returns:
            A `(docs, tags)` tuple:
                `docs` -- list of `[percent, data, value of slot 2]`, one
                          entry per match
                `tags` -- list of `[term, scaled score]` sorted by term; the
                          expansion weights are mapped linearly onto 75..375.
                          When all weights are equal every term gets 75.0.
        """
        def report(text, fraction=None):
            # Update the progress bar, then let GTK process pending events
            # so the change becomes visible.
            if progressbar is None:
                return
            if fraction is not None:
                progressbar.set_fraction(fraction)
            progressbar.set_text(text)
            while gtk.events_pending():
                gtk.main_iteration()

        enquire = search_options['enquire']
        n_mset = search_options['n_mset']

        # Matching set
        logging.debug('Getting MSet')
        report('0%')
        mset = enquire.get_mset(0, n_mset, 0, None, None)

        # Relevance set, filled from the matching set
        logging.debug('Getting RSet')
        report('33%', 0.33)

        docs = []
        rset = xapian.RSet()
        for position, match in enumerate(mset):
            if position < n_mset:
                rset.add_document(match.docid)
                docs.append([
                    match.percent,
                    match.document.get_data(),
                    match.document.get_value(2)
                ])
            else:
                logging.warning(
                    'More docs in mset than expected, something is wrong')

        # Obtain the "Expansion set" for the search: the n most relevant terms that
        # match the filter
        logging.debug('Getting ESet')
        report('66%', 0.66)
        eset = enquire.get_eset(
            search_options['n_eset'],
            rset,
            # 0 = exclude query terms in eset; 1 = include query terms in eset
            search_options['eset_showqueryterms'],
            1,
            MMEsetFilter(stopwords[search_options['selected_language']],
                         search_options['eset_white_list']))

        # Read the "Expansion set": one score per term
        tagscores = dict((item.term, item.weight) for item in eset)

        tags = []
        if tagscores:
            maxscore = max(tagscores.values())
            minscore = min(tagscores.values())
            span = maxscore - minscore
            for tag, tagscore in tagscores.items():
                if span:
                    scaled = (tagscore - minscore) * 100 / span * 3 + 75
                else:
                    # One term, or all weights equal: the linear scaling
                    # would divide by zero, so use the bottom of the range.
                    scaled = 75.0
                tags.append([tag, scaled])
            # sort by tag alphabetically
            tags.sort()

        return docs, tags
Ejemplo n.º 12
0
    def run(self, search_options, progressbar=None):
        """Score how closely pairs of expansion terms occur in the top matches.

        The query held by ``search_options['enquire']`` is run, every match
        is fed into a relevance set, and the resulting expansion terms
        (ESet) are compared pairwise using their positions inside each
        matched document.

        Args:
            search_options: dict read with the keys ``'enquire'``,
                ``'db'``, ``'n_mset'``, ``'n_eset'``,
                ``'eset_showqueryterms'``, ``'selected_language'`` and
                ``'eset_white_list'``.
            progressbar: optional object offering ``set_fraction`` and
                ``set_text``.  When ``None`` no progress is reported and
                the GTK event loop is not pumped.

        Returns:
            A list of ``[term_a, term_b, score, avg_wdf_a, avg_wdf_b]``
            entries, one per term pair (``term_a < term_b``) with a
            non-zero accumulated score.
        """

        def pump_events():
            # Give GTK a chance to repaint while this long computation runs.
            while gtk.events_pending():
                gtk.main_iteration()

        def report(fraction):
            # No-op without a progress bar, so headless callers can rely on
            # the ``progressbar=None`` default.
            if progressbar is None:
                return
            progressbar.set_fraction(fraction)
            progressbar.set_text('%.0f%%' % (fraction * 100))
            pump_events()

        enquire = search_options['enquire']

        logging.debug('Getting MSet')
        if progressbar is not None:
            # Only the label is reset here; the fraction is first set at 25%.
            progressbar.set_text('0%')
            pump_events()
        mset = enquire.get_mset(0, search_options['n_mset'], 0, None, None)

        logging.debug('Getting RSet')
        report(0.25)
        rset = xapian.RSet()
        for m in mset:
            rset.add_document(m.docid)

        logging.debug('Getting ESet')
        report(0.5)
        eset = enquire.get_eset(
            search_options['n_eset'],
            rset,
            # 0 = exclude query terms in eset; 1 = include query terms in eset
            search_options['eset_showqueryterms'],
            1,
            MMEsetFilter(stopwords[search_options['selected_language']],
                         search_options['eset_white_list']))

        report(0.75)

        logging.debug('Calculating distances on %i terms', len(eset))
        db = search_options['db']
        n_docs = len(mset)

        # positions_matrix[term_index][docid] -> positions of that term in
        # that document; wdf_dict[term_index] -> wdf averaged over the mset.
        positions_matrix = {}
        wdf_dict = {}
        for ki, keyword in enumerate(eset):
            positions_arrays = {}
            wdf_total = 0
            for m in mset:
                docid = m.docid
                try:
                    positions_array = set(
                        db.positionlist(docid, keyword.term))
                except xapian.RangeError:
                    positions_array = []
                positions_arrays[docid] = positions_array

                tl = db.get_document(docid).termlist()
                try:
                    wdf_total += tl.skip_to(keyword.term).wdf
                except Exception:
                    # Best effort: a document whose term list cannot supply
                    # a wdf for this term simply contributes nothing.
                    continue

            positions_matrix[ki] = positions_arrays
            # Starting the sum at 0 keeps a term without any wdf from
            # raising KeyError; the guard avoids dividing by an empty mset.
            wdf_dict[ki] = wdf_total / float(n_docs) if n_docs else 0.0

            if progressbar is not None:
                report(0.75 + 0.125 / float(search_options['n_eset']) * ki)

        full_distances_list = []
        for ki, keyword in enumerate(eset):
            for oi, other in enumerate(eset):
                if keyword.term < other.term:
                    distance = 0

                    for m in mset:
                        docid = m.docid
                        # Distances between every occurrence of the two
                        # terms in this document.
                        doc_distances = [
                            abs(i - j)
                            for i in positions_matrix[ki][docid]
                            for j in positions_matrix[oi][docid]
                        ]

                        # Of all those pairs only the max(wdf_i, wdf_j)
                        # closest ones are kept.
                        tl = db.get_document(docid).termlist()
                        try:
                            keyword_wdf = tl.skip_to(keyword.term).wdf
                            other_wdf = tl.skip_to(other.term).wdf
                        except Exception:
                            # Skip the document instead of reusing wdf
                            # values left over from a previous document (or
                            # hitting a NameError on the very first one).
                            continue
                        num_kept_distances = max(keyword_wdf, other_wdf)

                        if doc_distances:
                            doc_distances.sort()
                            # NOTE(review): two terms stored at the same
                            # position give a distance of 0 and this
                            # division raises ZeroDivisionError.
                            distance += sum([
                                1 / float(d)
                                for d in doc_distances[:num_kept_distances]
                            ])

                    if distance != 0:
                        # NOTE(review): the normalisation uses the pair
                        # count of the last document processed, not a
                        # per-document value -- confirm this is intended.
                        score = (distance / float(num_kept_distances) /
                                 float(len(mset)))
                        full_distances_list.append([
                            keyword.term, other.term, score,
                            wdf_dict[ki], wdf_dict[oi]
                        ])

                if progressbar is not None:
                    report(0.875 +
                           0.125 / float(search_options['n_eset']) * ki)

        return full_distances_list
Ejemplo n.º 13
0
    def run(self, search_options, progressbar=None):
        """Score term pairs by their minimum distance in the top matches.

        The query held by ``search_options['enquire']`` is run, every match
        is fed into a relevance set, and each pair of resulting expansion
        terms is scored from the smallest positional gap between the two
        terms in every matched document that contains both.

        Args:
            search_options: dict read with the keys ``'enquire'``,
                ``'db'``, ``'n_mset'``, ``'n_eset'``,
                ``'eset_showqueryterms'``, ``'selected_language'`` and
                ``'eset_white_list'``.
            progressbar: optional object offering ``get_fraction``,
                ``set_fraction`` and ``set_text``.  When ``None`` no
                progress is reported and the GTK event loop is not pumped.

        Returns:
            A list of ``[term_a, term_b, closeness, weight_a, weight_b]``
            entries, one per unordered term pair that co-occurs in at
            least one matched document.
        """

        def pump_events():
            # Give GTK a chance to repaint while this long computation runs.
            while gtk.events_pending():
                gtk.main_iteration()

        def report(fraction):
            # No-op without a progress bar, so headless callers can rely on
            # the ``progressbar=None`` default.
            if progressbar is None:
                return
            progressbar.set_fraction(fraction)
            progressbar.set_text('%.0f%%' % (fraction * 100))
            pump_events()

        def advance(step):
            # Move the bar forward by ``step``, never past 100%.
            if progressbar is None:
                return
            report(min(progressbar.get_fraction() + step, 1.0))

        enquire = search_options['enquire']

        logging.debug('Getting MSet')
        if progressbar is not None:
            # Only the label is reset here; the fraction is first set at 25%.
            progressbar.set_text('0%')
            pump_events()
        mset = enquire.get_mset(0, search_options['n_mset'], 0, None, None)

        logging.debug('Getting RSet')
        report(0.25)
        rset = xapian.RSet()
        for m in mset:
            rset.add_document(m.docid)

        logging.debug('Getting ESet')
        report(0.5)
        eset = enquire.get_eset(
            search_options['n_eset'] + 1,
            rset,
            # 0 = exclude query terms in eset; 1 = include query terms in eset
            search_options['eset_showqueryterms'],
            1,
            MMEsetFilter(stopwords[search_options['selected_language']],
                         search_options['eset_white_list']))

        logging.debug('Calculating distances on %i terms', len(eset))
        report(0.75)

        db = search_options['db']
        # Progress steps are sized by the number of terms actually returned.
        # One more term than n_eset is requested above, so steps based on
        # n_eset would push the bar past 100% (and divide by zero when
        # n_eset is 0).
        n_terms = len(eset)

        # positions_matrix[term_index][docid] -> positions of that term in
        # that document.
        positions_matrix = {}
        for ki, keyword in enumerate(eset):
            positions_arrays = {}
            for m in mset:
                docid = m.docid
                try:
                    positions_array = set(
                        db.positionlist(docid, keyword.term))
                except xapian.RangeError:
                    positions_array = []
                positions_arrays[docid] = positions_array
            positions_matrix[ki] = positions_arrays

            advance(0.125 / n_terms)

        distances_list = []
        for ki, keyword in enumerate(eset):
            for oi, other in enumerate(eset):
                if ki < oi:
                    # Smallest gap between the two terms in each document
                    # where both occur.
                    distances = []
                    for m in mset:
                        docid = m.docid
                        gaps = [
                            abs(i - j)
                            for i in positions_matrix[ki][docid]
                            for j in positions_matrix[oi][docid]
                        ]
                        if gaps:
                            distances.append(min(gaps))

                    if distances:
                        # Closeness is the inverse of the summed minimum
                        # gaps divided by the requested mset size.
                        # NOTE(review): raises ZeroDivisionError when every
                        # minimum gap is 0, i.e. the two terms share a
                        # position in each document where both occur.
                        closeness = 1 / float(
                            sum(distances) / float(search_options['n_mset']))
                        distances_list.append([
                            keyword.term, other.term, closeness,
                            keyword.weight, other.weight
                        ])

                advance(0.125 / (n_terms * n_terms))

        return distances_list
Ejemplo n.º 14
0
def test_all():
    """Run a broad smoke test over the xapian Python bindings.

    A single in-memory database is created and extended as the test goes
    on, so the sections below depend on running in this order: the
    expected counts and docids are only valid for the database state at
    that point.  Failures are reported through expect(), expect_query(),
    expect_exception() and TestFail.

    """
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(),
                      xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    # A regexp check would be better, but seems to create a bogus "leak" of -1
    # objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3, 'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1', 'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        # The attribute lookup is expected to raise AttributeError; the
        # print is only reached if xapian.cvar unexpectedly exists.
        res = xapian.cvar
        print "Unhandled constants: ", res
        return res

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    expect_exception(AttributeError, "'module' object has no attribute 'cvar'",
                     access_cvar)

    stem = xapian.Stem("english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    # Document data must survive an embedded zero byte.
    doc = xapian.Document()
    doc.set_data("a\0b")
    if doc.get_data() == "a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), "a\0b", "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data("is there anybody out there?")
    doc.add_term("XYzzy")
    # "there" is posted at two positions (2 and 5); the positionlist check
    # further down expects exactly two entries for it.
    doc.add_posting(stem("is"), 1)
    doc.add_posting(stem("there"), 2)
    doc.add_posting(stem("anybody"), 3)
    doc.add_posting(stem("out"), 4)
    doc.add_posting(stem("there"), 5)

    db = xapian.WritableDatabase('', xapian.DB_BACKEND_INMEMORY)
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    # Query construction from a list, a tuple, and a mix of Query objects
    # and plain strings.
    terms = ["smoke", "test", "terms"]
    expect_query(xapian.Query(xapian.Query.OP_OR, terms),
                 "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE, ("smoke", "test", "tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR, (xapian.Query("smoke"), query1, "string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(query2, "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(xapian.Query(xapian.Query.OP_OR, subqs), "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, '1', '4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected:

    expect_exception(xapian.DatabaseNotFoundError, None,
                     xapian.Database, "nosuchdir/nosuchdb", xapian.DB_BACKEND_STUB)
    expect_exception(xapian.DatabaseNotFoundError, None,
                     xapian.WritableDatabase, "nosuchdir/nosuchdb", xapian.DB_OPEN|xapian.DB_BACKEND_STUB)

    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, "/bin/false", "")
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, "/bin/false", "")

    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open, "127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None,
                     xapian.remote_open_writable, "127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:

    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)

    # Check Xapian::BAD_VALUENO is wrapped suitably.
    enq.set_collapse_key(xapian.BAD_VALUENO)

    enq.set_query(xapian.Query(xapian.Query.OP_OR, "there", "is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = " ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, "is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist("there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, "there"):
        count += 1
    expect(count, 2, "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    # After skipping to 'n', every remaining item must sort at or after 'n'.
    term = doc.termlist()
    term.skip_to('n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < 'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" % x.term)

    # Feature test for Document.values
    count = 0
    for term in doc.values():
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found", db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    # A second document is added here; it is the only one carrying "yes" in
    # value slot 0, which is what the decider below selects on.
    doc = xapian.Document()
    doc.set_data("Two")
    doc.add_posting(stem("out"), 1)
    doc.add_posting(stem("outside"), 1)
    doc.add_posting(stem("source"), 2)
    doc.add_value(0, "yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            return doc.get_value(0) == "yes"

    query = xapian.Query(stem("out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1, "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            return (not term.startswith('a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(), "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith('a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    # Without min_wt the lowest-ranked term is expected to weigh less than
    # 1.9; with min_wt=1.9 the last term must weigh at least that much.
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True, "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True, "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError, "Syntax: <expression> AND <expression>", qp.parse_query, "test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(qp.parse_query("NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(0 * <alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem('en'))
    expect_query(qp.parse_query("foo ox", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND (WILDCARD SYNONYM ox OR Zox@2))")

    expect_query(qp.parse_query("foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND (WILDCARD SYNONYM outside OR Zoutsid@2))")

    # Test supplying unicode strings
    # The expected descriptions are the UTF-8 encoded byte strings.
    expect_query(xapian.Query(xapian.Query.OP_OR, (u'foo', u'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', u'bar\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', 'bar\xc2\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, u'foo', u'bar'),
                 '(foo OR bar)')

    expect_query(qp.parse_query(u"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
                 "(0 * <alldocuments> AND_NOT Zt\xc3\xa9st@1)")

    doc = xapian.Document()
    doc.set_data(u"Unicode with an acc\xe9nt")
    doc.add_posting(stem(u"out\xe9r"), 1)
    expect(doc.get_data(), u"Unicode with an acc\xe9nt".encode('utf-8'))
    # Calls the iterator's .next() method directly (Python 2 iterator
    # protocol), unlike the next(term) builtin call used further up.
    term = doc.termlist().next().term
    expect(term, u"out\xe9r".encode('utf-8'))

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    stop.add('a')
    expect(stop('a'), True)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == "b"

        def get_description(self):
            return u"my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), u"my_b_stopper")
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    expect(stop('b'), True)
    expect_query(qp.parse_query(u"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test SimpleStopper initialised from a file.
    # The stop list is looked up relative to $srcdir, falling back to the
    # current directory when that variable is not set.
    try:
        srcdir = os.environ['srcdir']
    except KeyError:
        srcdir = '.'
    stop = xapian.SimpleStopper(srcdir + '/../shortstop.list')
    expect(stop('a'), True)
    expect(stop('am'), False)
    expect(stop('an'), True)
    expect(stop('the'), True)

    expect_exception(xapian.InvalidArgumentError, None, xapian.SimpleStopper, 'nosuchfile')

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text('foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer]) for item in doc.termlist()], [('bar', 1, [2]), ('baz', 1, [3]), ('foo', 2, [1, 4])])


    # Check DateRangeProcessor works
    context("checking that DateRangeProcessor works")
    qp = xapian.QueryParser()
    rpdate = xapian.DateRangeProcessor(1, xapian.RP_DATE_PREFER_MDY, 1960)
    qp.add_rangeprocessor(rpdate)
    query = qp.parse_query('12/03/99..12/04/01')
    expect(str(query), 'Query(VALUE_RANGE 1 19991203 20011204)')

    # Feature test for xapian.FieldProcessor
    context("running feature test for xapian.FieldProcessor")
    class testfieldprocessor(xapian.FieldProcessor):
        def __call__(self, s):
            # Raising for the value 'spam' lets the test check that an
            # exception from the processor reaches the parse_query caller.
            if s == 'spam':
                raise Exception('already spam')
            return xapian.Query("spam")

    qp.add_prefix('spam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam2', testfieldprocessor(), False) # Old-style
    qp.add_boolean_prefix('boolspam3', testfieldprocessor(), '')
    qp.add_boolean_prefix('boolspam4', testfieldprocessor(), 'group')
    qp.add_boolean_prefix('boolspam5', testfieldprocessor(), None)
    query = qp.parse_query('spam:ignored')
    expect(str(query), 'Query(spam)')

    expect_exception(Exception, 'already spam', qp.parse_query, 'spam:spam')

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)

    # Regression test - fixed in 0.9.10.1.
    # The parsed query is not inspected; the call just has to succeed.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query("I like tea")

    # Regression test for bug fixed in 1.4.4:
    # https://bugs.debian.org/849722
    oqparser.add_boolean_prefix('tag', 'K', '')
    # Make sure other cases also work:
    oqparser.add_boolean_prefix('zag', 'XR', False) # Old-style
    oqparser.add_boolean_prefix('rag', 'XR', None)
    oqparser.add_boolean_prefix('nag', 'XB', '')
    oqparser.add_boolean_prefix('bag', 'XB', 'blergh')
    oqparser.add_boolean_prefix('gag', 'XB', u'blergh')
    oqparser.add_boolean_prefix('jag', 'XB', b'blergh')

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata('Foo'), '')
    db.set_metadata('Foo', 'Foo')
    expect(db.get_metadata('Foo'), 'Foo')
    # The empty-key get_metadata check is made both before and after the
    # rejected set_metadata call.
    expect_exception(xapian.InvalidArgumentError, "Empty metadata keys are invalid", db.get_metadata, '')
    expect_exception(xapian.InvalidArgumentError, "Empty metadata keys are invalid", db.set_metadata, '', 'Foo')
    expect_exception(xapian.InvalidArgumentError, "Empty metadata keys are invalid", db.get_metadata, '')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query('foo'), 5),
                 "5 * foo")
Ejemplo n.º 15
0
def test_all():
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(), xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    def access_cvar():
        return xapian.cvar

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    expect_exception(AttributeError, "'module' object has no attribute 'cvar'",
                     access_cvar)

    stem = xapian.Stem("english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    doc = xapian.Document()
    doc.set_data("a\0b")
    if doc.get_data() == "a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), "a\0b",
           "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data("is there anybody out there?")
    doc.add_term("XYzzy")
    doc.add_posting(stem("is"), 1)
    doc.add_posting(stem("there"), 2)
    doc.add_posting(stem("anybody"), 3)
    doc.add_posting(stem("out"), 4)
    doc.add_posting(stem("there"), 5)

    db = xapian.inmemory_open()
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(xapian.Query(xapian.Query.OP_OR, terms),
                 "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE, ("smoke", "test", "tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query("smoke"), query1, "string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(
        query2, "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(xapian.Query(xapian.Query.OP_OR, subqs), "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, '1', '4'),
                 "VALUE_RANGE 0 1 4")

    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)
    enq.set_query(xapian.Query(xapian.Query.OP_OR, "there", "is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = " ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, "is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist("there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, "there"):
        count += 1
    expect(count, 2,
           "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to('n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < 'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" % x.term)

    # Feature test for Document.values
    count = 0
    for term in doc.values():
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data("Two")
    doc.add_posting(stem("out"), 1)
    doc.add_posting(stem("outside"), 1)
    doc.add_posting(stem("source"), 2)
    doc.add_value(0, "yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        def __call__(self, doc):
            return doc.get_value(0) == "yes"

    query = xapian.Query(stem("out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        def __call__(self, term):
            return (not term.startswith('a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            testexpanddecider())
    eset_terms = [term[xapian.ESET_TNAME] for term in eset.items]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith('a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect(eset.items[-1][xapian.ESET_WT] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            None, 1.9)
    expect(eset.items[-1][xapian.ESET_WT] >= 1.9, True,
           "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>", qp.parse_query,
                     "test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(
        qp.parse_query("NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT test:(pos=1))")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem('en'))
    expect_query(
        qp.parse_query("foo o", qp.FLAG_PARTIAL),
        "(Zfoo:(pos=1) AND ((out:(pos=2) SYNONYM outsid:(pos=2)) OR Zo:(pos=2)))"
    )

    expect_query(qp.parse_query("foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo:(pos=1) AND Zoutsid:(pos=2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (u'foo', u'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', u'bar\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, ('foo', 'bar\xc2\xa3')),
                 '(foo OR bar\xc2\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, u'foo', u'bar'),
                 '(foo OR bar)')

    expect_query(
        qp.parse_query(u"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT Zt\xc3\xa9st:(pos=1))")

    doc = xapian.Document()
    doc.set_data(u"Unicode with an acc\xe9nt")
    doc.add_posting(stem(u"out\xe9r"), 1)
    expect(doc.get_data(), u"Unicode with an acc\xe9nt".encode('utf-8'))
    term = doc.termlist().next().term
    expect(term, u"out\xe9r".encode('utf-8'))

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2) AND Za:(pos=3))")

    stop.add('a')
    expect(stop('a'), True)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2))")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        def __call__(self, term):
            return term == "b"

        def get_description(self):
            return u"my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), u"my_b_stopper")
    qp.set_stopper(stop)
    expect(stop('a'), False)
    expect_query(qp.parse_query(u"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2) AND Za:(pos=3))")

    expect(stop('b'), True)
    expect_query(qp.parse_query(u"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo:(pos=1) AND Zbar:(pos=2))")

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text('foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()], [('bar', 1, [2]), ('baz', 1, [3]),
                                          ('foo', 2, [1, 4])])

    # Check DateValueRangeProcessor works
    context("checking that DateValueRangeProcessor works")
    qp = xapian.QueryParser()
    vrpdate = xapian.DateValueRangeProcessor(1, 1, 1960)
    qp.add_valuerangeprocessor(vrpdate)
    query = qp.parse_query('12/03/99..12/04/01')
    expect(str(query), 'Xapian::Query(VALUE_RANGE 1 19991203 20011204)')

    # Regression test for bug#193, fixed in 1.0.3.
    context("running regression test for bug#193")
    vrp = xapian.NumberValueRangeProcessor(0, '$', True)
    a = '$10'
    b = '20'
    slot, a, b = vrp(a, b)
    expect(slot, 0)
    expect(xapian.sortable_unserialise(a), 10)
    expect(xapian.sortable_unserialise(b), 20)

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)

    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query("I like tea")

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata('Foo'), '')
    db.set_metadata('Foo', 'Foo')
    expect(db.get_metadata('Foo'), 'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, '')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.set_metadata, '',
                     'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, '')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(
        xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query('foo'), 5),
        "5 * foo")
Ejemplo n.º 16
0
    # Combine command line arguments up to "--" with spaces between
    # them, so that simple queries don't have to be quoted at the shell
    # level.
    query_string = sys.argv[2]
    index = 3
    while index < len(sys.argv):
        arg = sys.argv[index]
        index += 1
        if arg == '--':
            # Passed marker, move to parsing relevant docids.
            break
        query_string += ' '
        query_string += arg

    # Create an RSet with the listed docids in.
    reldocs = xapian.RSet()
    for index in range(index, len(sys.argv)):
        reldocs.add_document(int(sys.argv[index]))

    # Parse the query string to produce a Xapian::Query object.
    qp = xapian.QueryParser()
    stemmer = xapian.Stem("english")
    qp.set_stemmer(stemmer)
    qp.set_database(database)
    qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    query = qp.parse_query(query_string)

    if not query.empty():
        print("Parsed query is: %s" % str(query))

        # Find the top 10 results for the query.
# Attach the parsed query to the enquire object used by the get_mset() calls
# below.
enquire.set_query(query)

# Instead of showing the results of the query, we ask Xapian which terms in
# the index are most relevant to this search.
# Normally you would use those terms to suggest to the user possible ways of
# refining the search.  Here the feature is (ab)used to find out which tags
# are most closely related to the search results.

# Use an adaptive cutoff so that poor matches are not picked as reference
# documents: fetch only the single best match, read its weight, and then
# require every later match to reach 70% of that weight.
# NOTE(review): matches[0] presumably fails when the query matches nothing --
# confirm whether an empty result needs handling here.
matches = enquire.get_mset(0, 1)
topWeight = matches[0].weight
# First argument (percentage cutoff) is 0, i.e. unused; the second argument is
# the absolute weight cutoff derived from the best match.
enquire.set_cutoff(0, topWeight * 0.7)

# Select the first 10 documents (with the cutoff above applied) as the key
# ones from which the relevant terms are computed.
rset = xapian.RSet()
for m in enquire.get_mset(0, 10):
    rset.add_document(m.docid)


# Xapian supports providing a filter object, to say that we are only interested
# in some terms.
# This one filters out all the keywords that are not tags, or that were in the
# list of query terms.
class Filter(xapian.ExpandDecider):
    """Expand decider that accepts only terms starting with the "XT" prefix.

    Per the surrounding script, these are the tag terms; every other
    candidate expansion term is rejected.
    """

    def __call__(self, term):
        """
        Return true if we want the term, else false

        The term may arrive either as text or as a byte string, depending on
        the Xapian Python bindings in use, so the prefix is compared in the
        same type as the term.  Comparing a bytes term against a text prefix
        would otherwise always be false under Python 3.
        """
        prefix = b"XT" if isinstance(term, bytes) else "XT"
        return term[:2] == prefix
Ejemplo n.º 18
0
 def get_neighborhood_rset(self, user, rec):
     """Return the neighbourhood of ``user`` as a ``xapian.RSet``.

     Calls ``self.get_neighborhood(user, rec)`` and adds the docid of the
     document behind every returned match to a freshly created RSet.
     """
     neighbours = self.get_neighborhood(user, rec)
     relevant = xapian.RSet()
     for match in neighbours:
         docid = match.document.get_docid()
         relevant.add_document(docid)
     return relevant