Exemple #1
0
def search(dbpath, querystring, offset=0, pagesize=10):
    # offset - defines starting point within result set
    # pagesize - defines number of records to retrieve

    # Open the database we're going to search.
    db = xapian.Database(dbpath)

    # Set up a QueryParser with a stemmer and suitable prefixes
    queryparser = xapian.QueryParser()
    queryparser.set_stemmer(xapian.Stem("en"))
    queryparser.set_stem_strategy(queryparser.STEM_SOME)
    queryparser.add_prefix("title", "S")
    queryparser.add_prefix("description", "XD")

    # And parse the query
    query = queryparser.parse_query(querystring)

    # Use an Enquire object on the database to run the query
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    # Start of example code.
    enquire.set_sort_by_value_then_relevance(1, False)
    # End of example code.

    # And print out something about each match
    matches = []
    for index, match in enumerate(enquire.get_mset(offset, pagesize)):
        fields = json.loads(match.document.get_data())
        print u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n        Population %(pop)s" % {
            'rank': offset + index + 1,
            'docid': match.docid,
            'name': fields.get('name', u''),
            'date': fields.get('admitted', u''),
            'pop': fields.get('population', u''),
            'lat': fields.get('latitude', u''),
            'lon': fields.get('longitude', u''),
            }
        matches.append(match.docid)

    # Finally, make sure we log the query and displayed results
    support.log_matches(querystring, offset, pagesize, matches)
def search(dbpath, querystring, offset=0, pagesize=10):
    # offset - defines starting point within result set
    # pagesize - defines number of records to retrieve

    # Open the database we're going to search.
    db = xapian.Database(dbpath)

    # Set up a QueryParser with a stemmer and suitable prefixes
    queryparser = xapian.QueryParser()
    queryparser.set_stemmer(xapian.Stem("en"))
    queryparser.set_stem_strategy(queryparser.STEM_SOME)
    queryparser.add_prefix("title", "S")
    queryparser.add_prefix("description", "XD")
    # and add in value range processors
    queryparser.add_valuerangeprocessor(
        xapian.NumberValueRangeProcessor(0, 'mm', False))
    queryparser.add_valuerangeprocessor(xapian.NumberValueRangeProcessor(
        1, ''))

    # And parse the query
    query = queryparser.parse_query(querystring)

    # Use an Enquire object on the database to run the query
    enquire = xapian.Enquire(db)
    enquire.set_query(query)

    # And print out something about each match
    matches = []
    for match in enquire.get_mset(offset, pagesize):
        fields = json.loads(match.document.get_data())
        print u"%(rank)i: #%(docid)3.3i (%(date)s) %(measurements)s\n        %(title)s" % {
            'rank': match.rank + 1,
            'docid': match.docid,
            'measurements': fields.get('MEASUREMENTS', u''),
            'date': fields.get('DATE_MADE', u''),
            'title': fields.get('TITLE', u''),
        }
        matches.append(match.docid)

    # Finally, make sure we log the query and displayed results
    support.log_matches(querystring, offset, pagesize, matches)
Exemple #3
0
    def reload_database(self):  # {{{
        """(Re)create the Xapian database handle and its parser/generator/enquire helpers."""
        # Writable handle; creates the database on first open.
        self.db = xapian.WritableDatabase(self.database,
                                          xapian.DB_CREATE_OR_OPEN)

        # Query parser stemmed for the instance language, with a title prefix.
        self.qp = xapian.QueryParser()
        self.qp.set_stemmer(xapian.Stem(self.language))
        self.qp.set_stemming_strategy(self.qp.STEM_SOME)
        self.qp.add_prefix("title", "S")

        # Term generator used at index time, stemmed the same way.
        self.tg = xapian.TermGenerator()
        self.tg.set_stemmer(xapian.Stem(self.language))
        try:
            self.tg.set_stemming_strategy(self.tg.STEM_SOME)
        except AttributeError:
            # Older bindings lack this method on TermGenerator; the stemmer
            # alone is good enough there.
            pass

        # Two enquire sessions: one plain, one sorted by the lowercase title
        # stored in value slot 2 (ascending).
        self.e = xapian.Enquire(self.db)
        self.sorted_e = xapian.Enquire(self.db)
        self.sorted_e.set_sort_by_value(2, False)
Exemple #4
0
 def _search(self, text, pagenum=1, limit=10, isPath=False):
     """Run `text` against the index and wrap one page of matches in XapianResults."""
     db = self._index.index
     # Start an enquire session.
     session = xapian.Enquire(db)
     # Parser bound to the database so stemming uses its vocabulary.
     self.parser = xapian.QueryParser()
     self.parser.set_stemmer(xapian.Stem("english"))
     self.parser.set_database(db)
     self.parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
     self.query = self.parser.parse_query(text, DEFAULT_SEARCH_FLAGS)
     # Run the parsed query over the requested page.
     session.set_query(self.query)
     start = (pagenum - 1) * limit
     page = session.get_mset(start, limit)
     # Bundle the matches with pagination metadata.
     return XapianResults(self,
                          page,
                          total_count=page.get_matches_estimated(),
                          pagenum=pagenum,
                          limit=limit)
Exemple #5
0
 def __init__(self, dbpath='/home/mirage/Downloads/weiboxa/simple'):
     """Open the Xapian database at `dbpath` and record the value-slot layout."""
     db = xapian.Database(dbpath)
     session = xapian.Enquire(db)
     # Parser with English stemming, bound to the opened database.
     parser = xapian.QueryParser()
     parser.set_stemmer(xapian.Stem("english"))
     parser.set_database(db)
     parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
     self.qp = parser
     self.enquire = session
     # Value-slot numbers used by the indexer for this database.
     self.emotionvi = 0
     self.keywordsvi = 1
     self.timestampvi = 2
     self.loctvi = 3
     self.reploctvi = 4
     self.emotiononlyvi = 5
     #usernamevi = 6
     self.hashtagsvi = 7
     #uidvi = 8
     #repnameslistvi = 9
     # Effectively "no limit" when fetching result sets.
     self.maxitems = 1000000000
Exemple #6
0
def enquire(querystring, versions=None):
    """Parse `querystring` and return a ready (db, enquiry) pair.

    Returns None when the Xapian database cannot be opened.
    """
    try:
        db = xapian.Database(settings.XAPIAN_DB_PATH)
    except xapian.DatabaseOpeningError:
        return None

    # Stemmed parser bound to the database; FLAG_PARTIAL enables
    # find-as-you-type style matching of the final term.
    parser = xapian.QueryParser()
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_database(db)
    query = parser.parse_query(querystring, parser.FLAG_PARTIAL)

    # Optionally restrict matches to the requested versions.
    if versions:
        query = xapian.Query(xapian.Query.OP_FILTER,
                             query,
                             make_version_queries(versions))

    enquiry = xapian.Enquire(db)
    enquiry.set_query(query)
    return db, enquiry
Exemple #7
0
def search(dbpath, querystring, option=0, offset=0, pagesize=10):
    """Run `querystring` against `dbpath` and return the rendered result lines.

    option   - selects the weighting scheme (see select_weight)
    offset   - defines starting point within result set
    pagesize - defines number of records to retrieve
    """
    db = xapian.Database(dbpath)

    # Parser with English stemming and the field prefixes used at index time.
    queryparser = xapian.QueryParser()
    queryparser.set_stemmer(xapian.Stem("en"))
    queryparser.set_stemming_strategy(queryparser.STEM_SOME)
    queryparser.add_prefix("title", "S")
    queryparser.add_prefix("description", "XD")
    query = queryparser.parse_query(querystring)

    enquire = xapian.Enquire(db)
    # Apply the weighting scheme requested by the caller.
    enquire.set_weighting_scheme(select_weight(option))
    enquire.set_query(query)

    matches = []
    lines = []
    for match in enquire.get_mset(offset, pagesize):
        fields = json.loads(match.document.get_data())
        lines.append(u"%(rank)i: #%(docid)3.3i %(title)s" % {
            'rank': match.rank + 1,
            'docid': match.docid,
            'title': fields.get('TITLE', u''),
        })
        matches.append(match.docid)
    support.log_matches(querystring, offset, pagesize, matches)
    # Each rendered line is newline-terminated, matching the original output.
    return ''.join(line + '\n' for line in lines)


### END of function
def xapian_get_ranked_index(index, pattern, params, hitset,
                            ranked_result_amount):
    """
    Queries a Xapian index.
    Returns: a list of ranked record ids [(recid, score), ...) contained in hitset
             and an intbitset of record ids contained in hitset.
    """
    result = []
    matched_recs = intbitset()

    # DATABASES is a module-level mapping from index name to an open
    # xapian.Database handle — presumably populated at startup; confirm.
    database = DATABASES[index]
    enquire = xapian.Enquire(database)
    qp = xapian.QueryParser()
    stemmer = xapian.Stem("english")
    qp.set_stemmer(stemmer)
    qp.set_database(database)
    qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)

    # Avoids phrase search to increase performance:
    # when the candidate hitset is large and the pattern is a quoted phrase,
    # rewrite it as an AND of its words, which is cheaper than true phrase
    # matching (at the cost of word-order precision).
    if "avoid_phrase_search_threshold" in params and len(hitset) >= params[
            "avoid_phrase_search_threshold"] and pattern.startswith('"'):
        pattern = pattern[1:-1]
        query_string = ' AND '.join(pattern.split(' '))
        pattern = qp.parse_query(query_string)
    else:
        query_string = pattern
        pattern = qp.parse_query(query_string, xapian.QueryParser.FLAG_PHRASE)

    enquire.set_query(pattern)
    # MatchDecider(hitset) restricts candidate documents to the hitset.
    matches = enquire.get_mset(0, ranked_result_amount, None,
                               MatchDecider(hitset))

    weight = params["weight"]
    for match in matches:
        recid = match.docid
        if recid in hitset:
            # Scale the 0-100 relevance percentage by the configured weight.
            score = int(match.percent) * weight
            result.append((recid, score))
            matched_recs.add(recid)
    return (result, matched_recs)
Exemple #9
0
def search(query, active_element, numresults=10):
    """Build a prefixed boolean query from the `query` dict and return decoded matches.

    `active_element` is moved to the end of the generated query string
    (False sorts before True in the key function).
    """
    ordered_fields = sorted(query, key=lambda k: k == active_element)
    # XXX There  should be a way  to do this without  going through an
    # intermediate string, and without adding prefixes.
    terms = []
    for field in ordered_fields:
        for word in query[field].split():
            terms.append('%s:%s' % (field, word))
    querystring = ' AND '.join(terms)

    db = xapian.Database(dbpath)
    queryparser = xapian.QueryParser()
    queryparser.set_database(db)
    # Register every known field prefix with the parser.
    for field, abbrev in fields.items():
        queryparser.add_prefix(field, abbrev)
    parsed = queryparser.parse_query(querystring,
                                     queryparser.FLAG_BOOLEAN |
                                     queryparser.FLAG_PARTIAL |
                                     queryparser.FLAG_WILDCARD)

    enquire = xapian.Enquire(db)
    # Boolean weighting: matches are filtered, not relevance-ranked.
    enquire.set_weighting_scheme(xapian.BoolWeight())
    enquire.set_query(parsed)
    return [json.loads(hit.document.get_data())
            for hit in enquire.get_mset(0, numresults)]
Exemple #10
0
    def query_parser(self, database=None):
        """Return a xapian.QueryParser object set up for this fieldmap.
        
        `database` should be supplied if you want to use spelling correction etc.
        
        """

        qp = xapian.QueryParser()
        if database:
            qp.set_database(database)

        # Stem queries only when the fieldmap was configured with a language.
        if self.language:
            qp.set_stemmer(xapian.Stem(self.language))

        # add fieldname prefixes
        # NOTE(review): dict.iteritems() means this snippet targets Python 2.
        # `valnum` is unpacked but unused here — presumably the field's value
        # slot, used elsewhere; confirm against the fieldmap definition.
        for name, (prefix, valnum, isfilter) in self._fieldmap.iteritems():
            if isfilter:
                # Filter fields act as boolean prefixes (unweighted filters).
                qp.add_boolean_prefix(name, prefix)
            else:
                qp.add_prefix(name, prefix)

        return qp
 def process(self, queryString, content):
     """Return `content` (or extracts of it) with query terms highlighted.

     NOTE(review): Python 2 code — `self.extractLength / 2` below relies on
     integer division.
     """
     # Parse query
     queryParser = xapian.QueryParser()
     queryParser.set_stemmer(xapian.Stem('english'))
     queryParser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
     query = queryParser.parse_query(queryString)
     # Parse content after replacing non-alphanumeric characters with spaces
     # (only for its side effect: it feeds the parser's unstem map).
     queryParser.parse_query(re.sub('\W', ' ', content).lower())
     # Create search pattern
     # unstemlist() maps each stemmed query term back to surface forms.
     documentTerms = sum(
         [list(queryParser.unstemlist(x)) for x in set(query)], [])
     if not documentTerms:
         documentTerms = set(query)
     pattern = re.compile(
         r'\b(%s)\b' % '|'.join(re.escape(x) for x in documentTerms),
         re.IGNORECASE)
     # If the user does not want to extract text or there is no queryString,
     if not self.extractLength or not queryString:
         extract = content
     # If the user wants to extract text and there is a queryString,
     else:
         # Initialize
         extractIntervals = []
         # Half the window shown on each side of a hit (Py2 int division).
         extractLengthHalved = self.extractLength / 2
         # For each matchInterval,
         for match in pattern.finditer(content):
             # Prepare
             mStart = max(0, match.start() - extractLengthHalved)
             mEnd = min(len(content), match.end() + extractLengthHalved)
             # Absorb it
             # (absorbInterval presumably merges overlapping windows into
             # extractIntervals — confirm against its definition.)
             absorbInterval((mStart, mEnd), extractIntervals)
         # Load extracts
         extract = self.joinText.join(content[eStart:eEnd].strip()
                                      for eStart, eEnd in extractIntervals)
     # If the user wants to highlight relevant terms and there is a queryString,
     if self.highlightTemplate and queryString:
         extract = pattern.sub(self.highlightTemplate % r'\1', extract)
     # Return
     return extract
Exemple #12
0
    def _create_query_for_string(self, text, require_all=True, analyzer=None):
        """Parse a plain string into a xapian.Query.

        @param text: the query string
        @type text: str
        @param require_all: combine terms with AND (True, default) or OR (False)
        @type require_all: bool
        @param analyzer: bitwise combination of CommonIndexer.ANALYZER_???
            options (partial matching, exact matching, tokenizing, ...);
            overrides the configured field analyzer settings. If None
            (default), the configured analyzer for the field is used.
        @type analyzer: int
        @return: resulting query object
        @rtype: xapian.Query
        """
        parser = xapian.QueryParser()
        parser.set_database(self.reader)
        # Boolean operator joining the parsed terms.
        default_op = xapian.Query.OP_AND if require_all else xapian.Query.OP_OR
        parser.set_default_op(default_op)
        if analyzer is None:
            analyzer = self.analyzer
        if analyzer & self.ANALYZER_PARTIAL > 0:
            # Partial matching: let the parser expand the trailing term.
            return parser.parse_query(text, xapian.QueryParser.FLAG_PARTIAL)
        if analyzer == self.ANALYZER_EXACT:
            # Exact matching bypasses parsing entirely.
            return xapian.Query(text)
        # Everything else (not partial and not exact): plain parse, no flags.
        return parser.parse_query(text, 0)
Exemple #13
0
    def _create_query_for_field(self, field, value, analyzer=None):
        """Create a field->value query.

        :param field: the fieldname to be used
        :type field: str
        :param value: the wanted value of the field
        :type value: str
        :param analyzer: bitwise combination of *CommonIndexer.ANALYZER_???*
                         options (partial matching, exact matching,
                         tokenizing, ...); overrides the configured field
                         analyzer settings. If None (default), the configured
                         analyzer for the field is used.
        :type analyzer: int
        :return: the resulting query object
        :rtype: xapian.Query
        """
        if analyzer is None:
            analyzer = self.analyzer
        prefix = field.upper()
        if analyzer == self.ANALYZER_EXACT:
            # exact matching -> keep special characters, skip the parser
            return xapian.Query("%s%s" % (prefix, value))
        # other queries need a parser object
        parser = xapian.QueryParser()
        parser.set_database(self.reader)
        # FLAG_PARTIAL for partial matching; otherwise no flags.
        flags = (xapian.QueryParser.FLAG_PARTIAL
                 if analyzer & self.ANALYZER_PARTIAL > 0 else 0)
        return parser.parse_query(value, flags, prefix)
Exemple #14
0
def search(dbpath, querystring, offset=0, pagesize=10):
    """Search `dbpath` for `querystring` and print one page of matches.

    offset   - defines starting point within result set
    pagesize - defines number of records to retrieve
    """
    # Open the database we're going to search.
    database = xapian.Database(dbpath)

    # Parser with English stemming and the field prefixes used at index time.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")

    parsed = parser.parse_query(querystring)

    # Run the query.
    session = xapian.Enquire(database)
    session.set_query(parsed)

    shown = []
    for hit in session.get_mset(offset, pagesize):
        data = json.loads(hit.document.get_data())
        print(
            u"%(rank)i: #%(docid)3.3i %(title)s" % {
                'rank': hit.rank + 1,
                'docid': hit.docid,
                'title': data.get('TITLE', u''),
            })
        shown.append(hit.docid)

    # Finally, make sure we log the query and displayed results
    # support.log_matches(querystring, offset, pagesize, matches)
    print(querystring, offset, pagesize, shown)
Exemple #15
0
def find(database, query, sortfield=None, reverse=False, maxnum=100):
    """Run `query` over `database`, optionally value-sorted, and return sexps.

    sort_direction is False for ascending order, True for descending order.
    """
    # Parser with English stemming and the project's term prefixes.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    for tp in iota.TERMPREFIXES:
        parser.add_prefix(*tp)

    # Run the parsed query.
    session = xapian.Enquire(database)
    session.set_query(parser.parse_query(query))

    # Sort by the requested value slot when it is known; silently ignore
    # unknown sort fields and fall back to pure relevance.
    if sortfield is not None:
        try:
            slot = iota.SLOTS[sortfield]
        except KeyError:
            pass
        else:
            session.set_sort_by_value_then_relevance(slot, not reverse)

    logging.info("execute query: {}".format(query))
    mset = session.get_mset(0, maxnum)

    # One sexp per match, plus a trailing record count.
    sexps = [match_as_sexp(hit) for hit in mset]
    sexps.append('(:found {})\n'.format(len(mset)))
    return sexps
def searchDB(queryStr, withContent=False, extractLength=32):
    """Run `queryStr` against the module-level `database`, print the top
    matches with highlighted extracts, and return their stored IDs.

    NOTE(review): Python 2 code (print statements). `withContent` is accepted
    but never used in this function.
    """
    # Parse Query
    queryParser = xapian.QueryParser()
    queryParser.set_stemmer(xapian.Stem('english'))
    queryParser.set_database(database)
    queryParser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)

    query = queryParser.parse_query(queryStr)

    # First page of five hits only.
    offset, limit = 0, 5

    # Start Query Session
    enquire = xapian.Enquire(database)
    enquire.set_query(query)

    docIds = list()

    # Display Matches
    matches = enquire.get_mset(offset, limit)
    print('*' * 50)
    for match in matches:
        print('-' * 50)

        # Value slot 0 holds the stored document identifier.
        pmId = match.document.get_value(0)
        docIds.append(match.document.get_value(0))
        print 'Rank/ID: %s, docID: %s' % (match.rank, pmId)
        print('-' * 50)
        # Process
        # TextMachine wraps query hits in '*...*' inside a short extract.
        content = match.document.get_data()
        extract = TextMachine(extractLength, '*%s*').process(queryStr, content)
        print extract.replace('\n', ' ')
        print('-' * 50)

    print('*' * 50)
    print 'No. of Docs matching Query: %s' % matches.get_matches_estimated()
    print 'No. of Docs Returned: %s' % matches.size()

    return docIds
Exemple #17
0
def test_value_range_processor():
    """Exercise MultipleValueRangeProcessor directly and via a QueryParser."""
    # str.upper is the mapping applied to both ends of a recognised range.
    vp = MultipleValueRangeProcessor(dict(foo=1, bar=2), str.upper)
    # Known prefixes map to their slot numbers with upcased bounds...
    assert vp('foo:abc', 'def') == (1, 'ABC', 'DEF')
    assert vp('bar:news', 'def') == (2, 'NEWS', 'DEF')
    assert vp('bar:', 'def') == (2, '', 'DEF')
    # ...while unknown or prefix-less terms yield BAD_VALUENO, unchanged.
    assert vp('bar', 'def') == (xapian.BAD_VALUENO, 'bar', 'def')
    assert vp('baz:foo', 'def') == (xapian.BAD_VALUENO, 'baz:foo', 'def')

    qp = xapian.QueryParser()
    db = xodb.temp()
    qp.set_database(db.backend)
    qp.add_valuerangeprocessor(vp)

    # Parsed range queries should target the mapped slots.
    query = qp.parse_query('foo:abc..def')
    assert str(query) == 'Xapian::Query(VALUE_RANGE 1 ABC DEF)'

    query = qp.parse_query('bar:abc..def')
    assert str(query) == 'Xapian::Query(VALUE_RANGE 2 ABC DEF)'

    query = qp.parse_query('bar:3..4')
    assert str(query) == 'Xapian::Query(VALUE_RANGE 2 3 4)'

    # An unrecognised prefix makes the range unparsable -> QueryParserError.
    assert_raises(xapian.QueryParserError, qp.parse_query, 'baz:abc..def')
Exemple #18
0
    def handle_query(self, q):
        """Parse `q`, run it over the index and return (path, title, context) triples."""
        database = xapian.Database(self.db_path)
        session = xapian.Enquire(database)
        # English-stemmed parser bound to the database.
        parser = xapian.QueryParser()
        parser.set_stemmer(xapian.Stem("english"))
        parser.set_database(database)
        parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)

        # Find the top 100 results for the query.
        session.set_query(parser.parse_query(q))
        matches = session.get_mset(0, 100)

        results = []
        for hit in matches:
            doc = hit.document
            context = self.extract_context(doc.get_data())
            results.append((doc.get_value(self.DOC_PATH),
                            doc.get_value(self.DOC_TITLE),
                            ''.join(context)))
        return results
    def search(self, server_guid, store_guid, folder_ids, fields_terms, query, limit_results, log):
        """ handle query; see links in the top for a description of the Xapian API """

        db = self.open_db(server_guid, store_guid, log=log)
        if not db:
            return []

        # Fixed prefixes plus one per requested MAPI field.
        parser = xapian.QueryParser()
        parser.add_prefix("sourcekey", "XK:")
        parser.add_prefix("folderid", "XF:")
        for fields, terms in fields_terms:
            for field in fields:
                parser.add_prefix('mapi%d' % field, "XM%d:" % field)
        log.info('performing query: %s', query)
        parser.set_database(db)
        flags = (xapian.QueryParser.FLAG_BOOLEAN
                 | xapian.QueryParser.FLAG_PHRASE
                 | xapian.QueryParser.FLAG_WILDCARD)
        parsed = parser.parse_query(query, flags)
        session = xapian.Enquire(db)
        session.set_query(parsed)
        # No explicit limit -> scan every document.
        count = limit_results or db.get_doccount()
        # XXX catch exception if database is being updated?
        matches = [hit.document.get_value(0).decode('ascii')
                   for hit in session.get_mset(0, count)]
        db.close()
        return matches
Exemple #20
0
    def get_latest_builds(self, package_name):
        """Return an OrderedDict of distname -> latest build for `package_name`.

        Returns None when the version map has no entry for the package.
        """
        session = xapian.Enquire(self._versionmap_db)
        parser = xapian.QueryParser()
        parser.set_database(self._versionmap_db)
        parser.add_boolean_prefix('key', 'XA')
        sanitized = utils.filter_search_string(package_name)
        session.set_query(parser.parse_query('key:%s' % sanitized))

        # Only the single best match is relevant.
        matches = session.get_mset(0, 1)
        if len(matches) == 0:
            return None
        results = json.loads(matches[0].document.get_data())

        # Walk the dist tags in order, keeping the first entry per distname.
        latest_builds = OrderedDict()
        previous = ""
        for dist in distmappings.tags:
            name = dist['name']
            if previous != name and name in results:
                latest_builds[name] = results[name]
                previous = name

        return latest_builds
def search(queryString, byDate=False, ownerID=None, extractLength=32):
    # Parse query string
    queryParser = xapian.QueryParser()
    queryParser.set_stemmer(xapian.Stem('english'))
    queryParser.set_database(database)
    queryParser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    query = queryParser.parse_query(queryString)
    # Set offset and limit for pagination
    offset, limit = 0, database.get_doccount()
    # Start query session
    enquire = xapian.Enquire(database)
    enquire.set_query(query)
    # Sort by date
    if byDate:
        enquire.set_sort_by_value(xapian_when)
    if ownerID == None:
        matches = enquire.get_mset(offset, limit)
    else:
        # Filter by ownerID
        matches = enquire.get_mset(offset, limit, None, MatchDecider(ownerID))
    # Display matches
    for match in matches:
        # Load
        documentWhen = match.document.get_value(xapian_when)
        # documentOwnerID = int(xapian.sortable_unserialise(match.document.get_value(xapian_owner_id)))
        # Display
        print '==================='
        print 'rank=%s, documentID=%s' % (match.rank, match.docid)
        # Process
        content = match.document.get_data()
        extract = TextMachine(extractLength,
                              '*%s*').process(queryString, content)
        print extract.replace('\n', ' ')
    print '==================='
    print 'Number of documents matching query: %s' % matches.get_matches_estimated(
    )
    print 'Number of documents returned: %s' % matches.size()
Exemple #22
0
def crossquery(querystring, lgs):
    url = 'http://www.galoes.org/grammars/%s/%s'
    width = 0
    #querystring = sys.argv[1]
    d = '/var/wiki'
    #lgs = glob.glob('/var/wiki/*')
    results = []
    for lg in lgs:
        qp = xapian.QueryParser()
        try:
            database = xapian.Database('/var/wiki/xapian/%s/index' % lg)
        except:
            #print "no index found for ", lg
            continue
        qp.set_database(database)
        msc = MoinSearchConnection(database)
        qresults = msc.get_all_documents(query=qp.parse_query(querystring))
        for r in qresults:
            wikiname = r.data['title'][0]
            wikinamefs = wikiname.replace(' ', '(20)').replace(',', '(2c)')
            try:
                revnr = open('%s/%s/pasges/%s/current' %
                             (d, lg, wikinamefs)).read().strip()
                content = open('%s/%s/pages/%s/revisions/%s' %
                               (d, lg, wikinamefs, revnr)).read()
            except IOError:
                print "File not Found", wikinamefs
                continue
            matches = re.findall(
                '%s%s%s' % (width * '.', querystring.lower(), width * '.'),
                content.lower())
            results.append({
                'link': url % (lg, wikinamefs),
                'matchcount': len(matches),
                'matches': matches,
            })
    return result
Exemple #23
0
def search(xap_db_path, querystring, offset=0, pagesize=10):
    """Search the Swedish 'food' index and return the raw match documents.

    offset - defines starting point within result set
    pagesize - defines number of records to retrieve
    """
    # Open the database we're going to search.
    db = xapian.Database(xap_db_path)
    # Set up a QueryParser with a stemmer and suitable prefixes
    queryparser = xapian.QueryParser()
    queryparser.set_stemmer(xapian.Stem("swedish"))
    queryparser.set_stemming_strategy(queryparser.STEM_SOME)
    queryparser.add_prefix("food", "XFOOD")

    # FIX: the last OR-ed constant was xapian.Query.OP_PHRASE — a Query
    # operator, not a QueryParser flag — so an arbitrary unrelated flag bit
    # was silently enabled. The intended flag is FLAG_PHRASE.
    xapian_flags = xapian.QueryParser.FLAG_WILDCARD | xapian.QueryParser.FLAG_BOOLEAN | \
                        xapian.QueryParser.FLAG_SYNONYM | xapian.QueryParser.FLAG_BOOLEAN_ANY_CASE | \
                        xapian.QueryParser.FLAG_LOVEHATE | xapian.QueryParser.FLAG_PHRASE

    # And parse the query
    queryparser.set_database(db)
    query = queryparser.parse_query(querystring, xapian_flags)
    # Use an Enquire object on the database to run the query
    enquire = xapian.Enquire(db)
    enquire.set_query(query)

    matches = []
    for match in enquire.get_mset(offset, pagesize):
        # Document data is stored as UTF-8; decode to text (Python 2 unicode)
        # before returning.
        s_fields = match.document.get_data()
        fields = unicode(s_fields, 'utf-8')
        matches.append(fields)

    return matches
Exemple #24
0
def get_full_text_matches(database, full_text_query, offset, limit,
                          match_decider):
    u"""Does the actual full-text search with Xapian.

    :Parameters:
      - `database`: the name of the RefDB database
      - `full_text_query`: the raw query string for the full text search; must
        not be empty
      - `offset`: offset of the returned hits within the complete hits list
      - `limit`: maximal number of returned hits
      - `match_decider`: Xapian match decider object, e.g. for taking the other
        search parameters into account

    :type database: unicode
    :type full_text_query: unicode
    :type offset: int
    :type limit: int
    :type match_decider: `MatchDecider`

    :Return:
      the found matches

    :rtype: ``Xapian.MSet``
    """
    # The on-disk index lives under a fixed directory, one per database name.
    index_path = os.path.join("/var/lib/django_refdb_indices", database)
    index = xapian.Database(index_path)
    session = xapian.Enquire(index)
    #    enquire.set_collapse_key(0)
    # English-stemmed parser bound to the index.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("english"))
    parser.set_database(index)
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    session.set_query(parser.parse_query(full_text_query))
    return session.get_mset(offset, limit, None, match_decider)
Exemple #25
0
def crossquery(querystring, lg):
    """Search the MoinMoin xapian index of language `lg` for `querystring` and
    collect literal context snippets from the matching page sources.

    NOTE(review): Python 2 code (print statements, str.decode).
    """
    url = 'http://www.galoes.org/grammars/%s/%s'
    # Characters of context captured on each side of a hit.
    width = 20

    d = '/var/wiki'
    results = []
    qp = xapian.QueryParser()
    database = xapian.Database('/var/wiki/xapian/%s/index' % lg)
    qp.set_database(database)
    msc = MoinSearchConnection(database)
    qresults = msc.get_all_documents(query=qp.parse_query(querystring))

    for r in qresults:
        wikiname = r.data['title'][0]
        # Filesystem-safe page name, as used by MoinMoin's page store.
        wikinamefs = wikiutil.quoteWikinameFS(wikiname)
        try:
            # 'current' holds the number of the latest revision of the page.
            refv = '%s/%s/pages/%s/current' % (d, lg, wikinamefs)
            revnr = open(refv).read().strip()
            contentf = '%s/%s/pages/%s/revisions/%s' % (d, lg, wikinamefs,
                                                        revnr)
            content = open(contentf).read().decode('utf8')
        except IOError:
            print "File not Found", wikinamefs
            continue

        # Case-insensitive literal search with up to `width` characters of
        # surrounding context on each side.
        matches = re.findall(
            u'%s%s%s' % ('.' + '{,%i}' % width, querystring.lower(),
                         '.' + '{,%i}' % width), content.lower())
        print matches
        results.append({
            'link': url % (lg, wikinamefs),
            'totalmatches': len(matches),
            'matches': matches,
            'name': wikiname,
        })
    return results
Exemple #26
0
def test_userstem():
    """Check that a user-supplied stemmer works with Stem, TermGenerator and
    QueryParser alike."""
    mystem = MyStemmer()
    stem = xapian.Stem(mystem)
    expect(stem(b'test'), b'tst')
    # A second wrapper around the same stemmer object must behave identically.
    stem2 = xapian.Stem(mystem)
    expect(stem2(b'toastie'), b'tst')

    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem(MyStemmer()))

    doc = xapian.Document()
    indexer.set_document(doc)
    indexer.index_text(b'hello world')

    # Stemmed forms carry the 'Z' prefix; unstemmed forms are indexed too.
    s = '/'
    for t in doc.termlist():
        s += t.term.decode('utf-8')
        s += '/'
    expect(s, '/Zhll/Zwrld/hello/world/')

    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem(MyStemmer()))
    # STEM_ALL stems every query term (no 'Z' prefix in the parsed query).
    parser.set_stemming_strategy(xapian.QueryParser.STEM_ALL)
    expect_query(parser.parse_query(b'color television'), '(clr@1 OR tlvsn@2)')
Exemple #27
0
from softwarecenter.enums import XapianValues
import softwarecenter.paths

# this is not a test as such, more a example of how xapian search
# work and useful features around them

if __name__ == "__main__":
    # NOTE(review): Python 2 example script (print statement below); the
    # scraped snippet ends mid-loop, so the fetched document is never used.

    # Search term comes from the command line; default to "app".
    if len(sys.argv) > 1:
        search_term = sys.argv[1]
    else:
        search_term = "app"

    db = xapian.Database(softwarecenter.paths.XAPIAN_PATH)

    parser = xapian.QueryParser()
    #parser.set_stemmer(xapian.Stem("english"))
    #parser.set_stemming_strategy(xapian.QueryParser.STEM_ALL)
    parser.set_database(db)
    #parser.add_prefix("pkg", "AP")
    # Partial + wildcard matching gives find-as-you-type behaviour.
    query = parser.parse_query(
        search_term,
        xapian.QueryParser.FLAG_PARTIAL | xapian.QueryParser.FLAG_WILDCARD)

    enquire = xapian.Enquire(db)
    # Sort by the popularity (POPCON) value slot, then by relevance.
    enquire.set_sort_by_value_then_relevance(XapianValues.POPCON)
    enquire.set_query(query)
    matches = enquire.get_mset(0, db.get_doccount())
    print "Matches:"
    for m in matches:
        doc = m.document
Exemple #28
0
def get_query_from_search_entry(search_term):
    """Parse *search_term* into a xapian.Query.

    An empty or None term yields ``xapian.Query("")`` so callers always
    receive a Query object.
    """
    if search_term:
        return xapian.QueryParser().parse_query(search_term)
    return xapian.Query("")
Exemple #29
0
    # NOTE(review): this is the interior of a larger routine — `index`,
    # `query_string`, `database` and `enquire` are initialised earlier in the
    # original file; confirm against the full source.
    # Collect query words from argv until the '--' marker.
    while index < len(sys.argv):
        arg = sys.argv[index]
        index += 1
        if arg == '--':
            # Passed marker, move to parsing relevant docids.
            break
        query_string += ' '
        query_string += arg

    # Create an RSet with the listed docids in.
    reldocs = xapian.RSet()
    for index in range(index, len(sys.argv)):
        reldocs.add_document(int(sys.argv[index]))

    # Parse the query string to produce a Xapian::Query object.
    qp = xapian.QueryParser()
    stemmer = xapian.Stem("english")
    qp.set_stemmer(stemmer)
    qp.set_database(database)
    qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    query = qp.parse_query(query_string)

    if not query.empty():
        print("Parsed query is: %s" % str(query))

        # Find the top 10 results for the query, using the relevant-document
        # set for relevance feedback.
        enquire.set_query(query)
        matches = enquire.get_mset(0, 10, reldocs)

        # Display the results.
        print("%i results found." % matches.get_matches_estimated())
Exemple #30
0
def test_all():
    """Broad smoke test of the xapian Python bindings.

    Exercises version reporting, Stem, Document, an in-memory
    WritableDatabase, Query construction, Enquire/MSet/ESet iteration,
    MatchDecider/ExpandDecider/Stopper subclassing, QueryParser features,
    value-range processors, FieldProcessor and database metadata.
    NOTE(review): this function continues beyond the visible chunk.
    """
    # Test the version number reporting functions give plausible results.
    v = "%d.%d.%d" % (xapian.major_version(), xapian.minor_version(),
                      xapian.revision())
    v2 = xapian.version_string()
    expect(v2, v, "Unexpected version output")

    # A regexp check would be better, but seems to create a bogus "leak" of -1
    # objects in Python 3.
    expect(len(xapian.__version__.split('.')), 3,
           'xapian.__version__ not X.Y.Z')
    expect((xapian.__version__.split('.'))[0], '1',
           'xapian.__version__ not "1.Y.Z"')

    def access_cvar():
        # Deliberately touches the SWIG-generated cvar attribute, which
        # should not exist; only reached if the expected AttributeError
        # fails to fire.
        res = xapian.cvar
        print("Unhandled constants: ", res)
        return res

    # Check that SWIG isn't generating cvar (regression test for ticket#297).
    #
    # Python 3.5 generates a different exception message here to earlier
    # versions, so we need a check which matches both.
    expect_exception(AttributeError,
                     lambda msg: msg.find("has no attribute 'cvar'") != -1,
                     access_cvar)

    stem = xapian.Stem(b"english")
    expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)")

    # Document data must transparently handle embedded zero bytes.
    doc = xapian.Document()
    doc.set_data(b"a\0b")
    if doc.get_data() == b"a":
        raise TestFail("get_data+set_data truncates at a zero byte")
    expect(doc.get_data(), b"a\0b",
           "get_data+set_data doesn't transparently handle a zero byte")
    doc.set_data(b"is there anybody out there?")
    doc.add_term(b"XYzzy")
    doc.add_posting(stem(b"is"), 1)
    doc.add_posting(stem(b"there"), 2)
    doc.add_posting(stem(b"anybody"), 3)
    doc.add_posting(stem(b"out"), 4)
    doc.add_posting(stem(b"there"), 5)

    # Build a small in-memory database for the query tests below.
    db = xapian.WritableDatabase('', xapian.DB_BACKEND_INMEMORY)
    db.add_document(doc)
    expect(db.get_doccount(), 1, "Unexpected db.get_doccount()")
    terms = ["smoke", "test", "terms"]
    expect_query(
        xapian.Query(xapian.Query.OP_OR, [t.encode('utf-8') for t in terms]),
        "(smoke OR test OR terms)")
    query1 = xapian.Query(xapian.Query.OP_PHRASE,
                          (b"smoke", b"test", b"tuple"))
    query2 = xapian.Query(xapian.Query.OP_XOR,
                          (xapian.Query(b"smoke"), query1, b"string"))
    expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)")
    expect_query(
        query2, "(smoke XOR (smoke PHRASE 3 test PHRASE 3 tuple) XOR string)")
    subqs = ["a", "b"]
    expect_query(
        xapian.Query(xapian.Query.OP_OR, [s.encode('utf-8') for s in subqs]),
        "(a OR b)")
    expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, b'1', b'4'),
                 "VALUE_RANGE 0 1 4")

    # Check database factory functions are wrapped as expected (or not wrapped
    # in the first cases):

    expect_exception(
        AttributeError,
        lambda msg: msg.find("has no attribute 'open_stub'") != -1,
        lambda: xapian.open_stub(b"nosuchdir/nosuchdb"))
    expect_exception(
        AttributeError,
        lambda msg: msg.find("has no attribute 'open_stub'") != -1,
        lambda: xapian.open_stub(b"nosuchdir/nosuchdb", xapian.DB_OPEN))

    expect_exception(
        xapian.DatabaseOpeningError, None,
        lambda: xapian.Database(b"nosuchdir/nosuchdb", xapian.DB_BACKEND_STUB))
    expect_exception(
        xapian.DatabaseOpeningError, None, lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb", xapian.DB_OPEN | xapian.DB_BACKEND_STUB))

    expect_exception(
        xapian.DatabaseOpeningError, None, lambda: xapian.Database(
            b"nosuchdir/nosuchdb", xapian.DB_BACKEND_GLASS))
    expect_exception(
        xapian.DatabaseCreateError, None, lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb", xapian.DB_CREATE | xapian.DB_BACKEND_GLASS))

    expect_exception(
        xapian.FeatureUnavailableError, None, lambda: xapian.Database(
            b"nosuchdir/nosuchdb", xapian.DB_BACKEND_CHERT))
    expect_exception(
        xapian.FeatureUnavailableError, None, lambda: xapian.WritableDatabase(
            b"nosuchdir/nosuchdb", xapian.DB_CREATE | xapian.DB_BACKEND_CHERT))

    # /bin/false exits immediately, so the remote connection must fail.
    expect_exception(xapian.NetworkError, None, xapian.remote_open,
                     b"/bin/false", b"")
    expect_exception(xapian.NetworkError, None, xapian.remote_open_writable,
                     b"/bin/false", b"")

    expect_exception(xapian.NetworkError, None, xapian.remote_open,
                     b"127.0.0.1", 0, 1)
    expect_exception(xapian.NetworkError, None, xapian.remote_open_writable,
                     b"127.0.0.1", 0, 1)

    # Check wrapping of MatchAll and MatchNothing:

    expect_query(xapian.Query.MatchAll, "<alldocuments>")
    expect_query(xapian.Query.MatchNothing, "")

    # Feature test for Query.__iter__
    term_count = 0
    for term in query2:
        term_count += 1
    expect(term_count, 4, "Unexpected number of terms in query2")

    enq = xapian.Enquire(db)
    enq.set_query(xapian.Query(xapian.Query.OP_OR, b"there", b"is"))
    mset = enq.get_mset(0, 10)
    expect(mset.size(), 1, "Unexpected mset.size()")
    expect(len(mset), 1, "Unexpected mset.size()")

    # Feature test for Enquire.matching_terms(docid)
    term_count = 0
    for term in enq.matching_terms(mset.get_hit(0)):
        term_count += 1
    expect(term_count, 2, "Unexpected number of matching terms")

    # Feature test for MSet.__iter__
    msize = 0
    for match in mset:
        msize += 1
    expect(msize, mset.size(), "Unexpected number of entries in mset")

    terms = b" ".join(enq.matching_terms(mset.get_hit(0)))
    expect(terms, b"is there", "Unexpected terms")

    # Feature test for ESet.__iter__
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enq.get_eset(10, rset)
    term_count = 0
    for term in eset:
        term_count += 1
    expect(term_count, 3, "Unexpected number of expand terms")

    # Feature test for Database.__iter__
    term_count = 0
    for term in db:
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db")

    # Feature test for Database.allterms
    term_count = 0
    for term in db.allterms():
        term_count += 1
    expect(term_count, 5, "Unexpected number of terms in db.allterms")

    # Feature test for Database.postlist
    count = 0
    for posting in db.postlist(b"there"):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('there')")

    # Feature test for Database.postlist with empty term (alldocspostlist)
    count = 0
    for posting in db.postlist(b""):
        count += 1
    expect(count, 1, "Unexpected number of entries in db.postlist('')")

    # Feature test for Database.termlist
    count = 0
    for term in db.termlist(1):
        count += 1
    expect(count, 5, "Unexpected number of entries in db.termlist(1)")

    # Feature test for Database.positionlist
    count = 0
    for term in db.positionlist(1, b"there"):
        count += 1
    expect(count, 2,
           "Unexpected number of entries in db.positionlist(1, 'there')")

    # Feature test for Document.termlist
    count = 0
    for term in doc.termlist():
        count += 1
    expect(count, 5, "Unexpected number of entries in doc.termlist()")

    # Feature test for TermIter.skip_to
    term = doc.termlist()
    term.skip_to(b'n')
    while True:
        try:
            x = next(term)
        except StopIteration:
            break
        if x.term < b'n':
            raise TestFail("TermIter.skip_to didn't skip term '%s'" %
                           x.term.decode('utf-8'))

    # Feature test for Document.values
    count = 0
    for term in list(doc.values()):
        count += 1
    expect(count, 0, "Unexpected number of entries in doc.values")

    # Check exception handling for Xapian::DocNotFoundError
    expect_exception(xapian.DocNotFoundError, "Docid 3 not found",
                     db.get_document, 3)

    # Check value of OP_ELITE_SET
    expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET")

    # Feature test for MatchDecider
    doc = xapian.Document()
    doc.set_data(b"Two")
    doc.add_posting(stem(b"out"), 1)
    doc.add_posting(stem(b"outside"), 1)
    doc.add_posting(stem(b"source"), 2)
    doc.add_value(0, b"yes")
    db.add_document(doc)

    class testmatchdecider(xapian.MatchDecider):
        # Accept only documents whose value slot 0 is b"yes".
        def __call__(self, doc):
            return doc.get_value(0) == b"yes"

    query = xapian.Query(stem(b"out"))
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    mset = enquire.get_mset(0, 10, None, testmatchdecider())
    expect(mset.size(), 1,
           "Unexpected number of documents returned by match decider")
    expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in")

    # Feature test for ExpandDecider
    class testexpanddecider(xapian.ExpandDecider):
        # Reject any expand term beginning with 'a'.
        def __call__(self, term):
            return (not term.startswith(b'a'))

    enquire = xapian.Enquire(db)
    rset = xapian.RSet()
    rset.add_document(1)
    eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            testexpanddecider())
    eset_terms = [item.term for item in eset]
    expect(len(eset_terms), eset.size(),
           "Unexpected number of terms returned by expand")
    if [t for t in eset_terms if t.startswith(b'a')]:
        raise TestFail("ExpandDecider was not used")

    # Check min_wt argument to get_eset() works (new in 1.2.5).
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ)
    expect([i.weight for i in eset][-1] < 1.9, True,
           "test get_eset() without min_wt")
    eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0,
                            None, 1.9)
    expect([i.weight for i in eset][-1] >= 1.9, True, "test get_eset() min_wt")

    # Check QueryParser parsing error.
    qp = xapian.QueryParser()
    expect_exception(xapian.QueryParserError,
                     "Syntax: <expression> AND <expression>", qp.parse_query,
                     b"test AND")

    # Check QueryParser pure NOT option
    qp = xapian.QueryParser()
    expect_query(
        qp.parse_query(b"NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT test@1)")

    # Check QueryParser partial option
    qp = xapian.QueryParser()
    qp.set_database(db)
    qp.set_default_op(xapian.Query.OP_AND)
    qp.set_stemming_strategy(qp.STEM_SOME)
    qp.set_stemmer(xapian.Stem(b'en'))
    expect_query(qp.parse_query(b"foo o", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((SYNONYM WILDCARD OR o) OR Zo@2))")

    expect_query(qp.parse_query(b"foo outside", qp.FLAG_PARTIAL),
                 "(Zfoo@1 AND ((SYNONYM WILDCARD OR outside) OR Zoutsid@2))")

    # Test supplying unicode strings
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar')),
                 '(foo OR bar)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xa3')),
                 '(foo OR bar\\xa3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xc2\xa3')),
                 '(foo OR bar\u00a3)')
    expect_query(xapian.Query(xapian.Query.OP_OR, b'foo', b'bar'),
                 '(foo OR bar)')

    expect_query(
        qp.parse_query(b"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT),
        "(<alldocuments> AND_NOT Zt\u00e9st@1)")

    doc = xapian.Document()
    doc.set_data(b"Unicode with an acc\xe9nt")
    doc.add_posting(stem(b"out\xe9r"), 1)
    expect(doc.get_data(), b"Unicode with an acc\xe9nt")
    term = next(doc.termlist()).term
    expect(term, b"out\xe9r")

    # Check simple stopper
    stop = xapian.SimpleStopper()
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    stop.add(b'a')
    expect(stop(b'a'), True)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Feature test for custom Stopper
    class my_b_stopper(xapian.Stopper):
        # Stop exactly the term b"b".
        def __call__(self, term):
            return term == b"b"

        def get_description(self):
            return "my_b_stopper"

    stop = my_b_stopper()
    expect(stop.get_description(), "my_b_stopper")
    qp.set_stopper(stop)
    expect(stop(b'a'), False)
    expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2 AND Za@3)")

    expect(stop(b'b'), True)
    expect_query(qp.parse_query(b"foo bar b", qp.FLAG_BOOLEAN),
                 "(Zfoo@1 AND Zbar@2)")

    # Test TermGenerator
    termgen = xapian.TermGenerator()
    doc = xapian.Document()
    termgen.set_document(doc)
    termgen.index_text(b'foo bar baz foo')
    expect([(item.term, item.wdf, [pos for pos in item.positer])
            for item in doc.termlist()], [(b'bar', 1, [2]), (b'baz', 1, [3]),
                                          (b'foo', 2, [1, 4])])

    # Check DateValueRangeProcessor works
    context("checking that DateValueRangeProcessor works")
    qp = xapian.QueryParser()
    vrpdate = xapian.DateValueRangeProcessor(1, 1, 1960)
    qp.add_valuerangeprocessor(vrpdate)
    query = qp.parse_query(b'12/03/99..12/04/01')
    expect(str(query), 'Query(VALUE_RANGE 1 19991203 20011204)')

    # Regression test for bug#193, fixed in 1.0.3.
    context("running regression test for bug#193")
    vrp = xapian.NumberValueRangeProcessor(0, b'$', True)
    a = '$10'
    b = '20'
    slot, a, b = vrp(a, b.encode('utf-8'))
    expect(slot, 0)
    expect(xapian.sortable_unserialise(a), 10)
    expect(xapian.sortable_unserialise(b), 20)

    # Feature test for xapian.FieldProcessor
    context("running feature test for xapian.FieldProcessor")

    class testfieldprocessor(xapian.FieldProcessor):
        # Maps any field value to Query("spam"); raises for the value 'spam'
        # to test exception propagation through the bindings.
        def __call__(self, s):
            if s == 'spam':
                raise Exception('already spam')
            return xapian.Query("spam")

    qp.add_prefix('spam', testfieldprocessor())
    qp.add_boolean_prefix('boolspam', testfieldprocessor())
    query = qp.parse_query('spam:ignored')
    expect(str(query), 'Query(spam)')

    # FIXME: This doesn't currently work:
    # expect_exception(Exception, 'already spam', qp.parse_query, 'spam:spam')

    # Regression tests copied from PHP (probably always worked in python, but
    # let's check...)
    context("running regression tests for issues which were found in PHP")

    # PHP overload resolution involving boolean types failed.
    enq.set_sort_by_value(1, True)

    # Regression test - fixed in 0.9.10.1.
    oqparser = xapian.QueryParser()
    oquery = oqparser.parse_query(b"I like tea")

    # Regression test for bug#192 - fixed in 1.0.3.
    enq.set_cutoff(100)

    # Test setting and getting metadata
    expect(db.get_metadata(b'Foo'), b'')
    db.set_metadata(b'Foo', b'Foo')
    expect(db.get_metadata(b'Foo'), b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, b'')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.set_metadata, b'',
                     b'Foo')
    expect_exception(xapian.InvalidArgumentError,
                     "Empty metadata keys are invalid", db.get_metadata, b'')

    # Test OP_SCALE_WEIGHT and corresponding constructor
    expect_query(
        xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query(b'foo'), 5),
        "5 * foo")