def search(dbpath, querystring, offset=0, pagesize=10): # offset - defines starting point within result set # pagesize - defines number of records to retrieve # Open the database we're going to search. db = xapian.Database(dbpath) # Set up a QueryParser with a stemmer and suitable prefixes queryparser = xapian.QueryParser() queryparser.set_stemmer(xapian.Stem("en")) queryparser.set_stem_strategy(queryparser.STEM_SOME) queryparser.add_prefix("title", "S") queryparser.add_prefix("description", "XD") # And parse the query query = queryparser.parse_query(querystring) # Use an Enquire object on the database to run the query enquire = xapian.Enquire(db) enquire.set_query(query) # Start of example code. enquire.set_sort_by_value_then_relevance(1, False) # End of example code. # And print out something about each match matches = [] for index, match in enumerate(enquire.get_mset(offset, pagesize)): fields = json.loads(match.document.get_data()) print u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n Population %(pop)s" % { 'rank': offset + index + 1, 'docid': match.docid, 'name': fields.get('name', u''), 'date': fields.get('admitted', u''), 'pop': fields.get('population', u''), 'lat': fields.get('latitude', u''), 'lon': fields.get('longitude', u''), } matches.append(match.docid) # Finally, make sure we log the query and displayed results support.log_matches(querystring, offset, pagesize, matches)
def search(dbpath, querystring, offset=0, pagesize=10): # offset - defines starting point within result set # pagesize - defines number of records to retrieve # Open the database we're going to search. db = xapian.Database(dbpath) # Set up a QueryParser with a stemmer and suitable prefixes queryparser = xapian.QueryParser() queryparser.set_stemmer(xapian.Stem("en")) queryparser.set_stem_strategy(queryparser.STEM_SOME) queryparser.add_prefix("title", "S") queryparser.add_prefix("description", "XD") # and add in value range processors queryparser.add_valuerangeprocessor( xapian.NumberValueRangeProcessor(0, 'mm', False)) queryparser.add_valuerangeprocessor(xapian.NumberValueRangeProcessor( 1, '')) # And parse the query query = queryparser.parse_query(querystring) # Use an Enquire object on the database to run the query enquire = xapian.Enquire(db) enquire.set_query(query) # And print out something about each match matches = [] for match in enquire.get_mset(offset, pagesize): fields = json.loads(match.document.get_data()) print u"%(rank)i: #%(docid)3.3i (%(date)s) %(measurements)s\n %(title)s" % { 'rank': match.rank + 1, 'docid': match.docid, 'measurements': fields.get('MEASUREMENTS', u''), 'date': fields.get('DATE_MADE', u''), 'title': fields.get('TITLE', u''), } matches.append(match.docid) # Finally, make sure we log the query and displayed results support.log_matches(querystring, offset, pagesize, matches)
def reload_database(self):  # {{{
    """(Re)create the Xapian handles: the writable database, the query
    parser, the term generator, and the two Enquire objects used for
    plain and title-sorted searches."""
    # create the xapian handlers
    self.db = xapian.WritableDatabase(self.database,
                                      xapian.DB_CREATE_OR_OPEN)
    self.qp = xapian.QueryParser()
    self.qp.set_stemmer(xapian.Stem(self.language))
    self.qp.set_stemming_strategy(self.qp.STEM_SOME)
    self.qp.add_prefix("title", "S")
    self.tg = xapian.TermGenerator()
    self.tg.set_stemmer(xapian.Stem(self.language))
    try:
        # some xapian binding versions lack this on TermGenerator,
        # hence the AttributeError guard
        self.tg.set_stemming_strategy(self.tg.STEM_SOME)
    except AttributeError:
        pass
    self.e = xapian.Enquire(self.db)
    self.sorted_e = xapian.Enquire(self.db)
    # Value 2 is the lowercase form of the title
    self.sorted_e.set_sort_by_value(2, False)
def _search(self, text, pagenum=1, limit=10, isPath=False):
    """Parse `text`, run it against the index, and wrap one page of
    matches in a XapianResults object."""
    db = self._index.index
    # An Enquire session drives the actual match retrieval.
    session = xapian.Enquire(db)
    # Build an English-stemming parser bound to the database.
    self.parser = xapian.QueryParser()
    self.parser.set_stemmer(xapian.Stem("english"))
    self.parser.set_database(db)
    self.parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    self.query = self.parser.parse_query(text, DEFAULT_SEARCH_FLAGS)
    session.set_query(self.query)
    # Page numbers are 1-based, so page 1 starts at offset 0.
    start = (pagenum - 1) * limit
    matches = session.get_mset(start, limit)
    return XapianResults(self,
                         matches,
                         total_count=matches.get_matches_estimated(),
                         pagenum=pagenum,
                         limit=limit)
def __init__(self, dbpath='/home/mirage/Downloads/weiboxa/simple'):
    """Open the index at `dbpath` and prepare a stemming query parser."""
    db = xapian.Database(dbpath)
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("english"))
    parser.set_database(db)
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    self.qp = parser
    self.enquire = xapian.Enquire(db)
    # Value-slot numbers used when the documents were indexed.
    self.emotionvi = 0
    self.keywordsvi = 1
    self.timestampvi = 2
    self.loctvi = 3
    self.reploctvi = 4
    self.emotiononlyvi = 5
    #usernamevi = 6
    self.hashtagsvi = 7
    #uidvi = 8
    #repnameslistvi = 9
    # Effectively "no limit" for result retrieval.
    self.maxitems = 1000000000
def enquire(querystring, versions=None):
    """Parse `querystring` and return (db, enquiry) ready to fetch
    matches, or None when the index cannot be opened."""
    try:
        db = xapian.Database(settings.XAPIAN_DB_PATH)
    except xapian.DatabaseOpeningError:
        return None
    parser = xapian.QueryParser()
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_database(db)
    parsed = parser.parse_query(querystring, parser.FLAG_PARTIAL)
    # Optionally restrict the hits to the requested versions.
    if versions:
        parsed = xapian.Query(xapian.Query.OP_FILTER,
                              parsed,
                              make_version_queries(versions))
    enquiry = xapian.Enquire(db)
    enquiry.set_query(parsed)
    return db, enquiry
def search(dbpath, querystring, option=0, offset=0, pagesize=10):
    """Search `dbpath` using a selectable weighting scheme.

    option   - passed to select_weight() to pick the weighting scheme
    offset   - defines starting point within result set
    pagesize - defines number of records to retrive

    Returns one newline-terminated line per match as a single string,
    and logs the query via support.log_matches().
    """
    db = xapian.Database(dbpath)
    queryparser = xapian.QueryParser()
    # choose a language
    queryparser.set_stemmer(xapian.Stem("en"))
    queryparser.set_stemming_strategy(queryparser.STEM_SOME)
    queryparser.add_prefix("title", "S")
    queryparser.add_prefix("description", "XD")
    query = queryparser.parse_query(querystring)
    enquire = xapian.Enquire(db)
    #select different weighting schema
    bm = select_weight(option)
    enquire.set_weighting_scheme(bm)
    enquire.set_query(query)
    matches = []
    lines = []
    for match in enquire.get_mset(offset, pagesize):
        fields = json.loads(match.document.get_data())
        lines.append(u"%(rank)i: #%(docid)3.3i %(title)s" % {
            'rank': match.rank + 1,
            'docid': match.docid,
            'title': fields.get('TITLE', u''),
        })
        matches.append(match.docid)
    support.log_matches(querystring, offset, pagesize, matches)
    # Join once instead of quadratic += concatenation; every line keeps
    # its trailing newline exactly as before.
    return u''.join(line + u'\n' for line in lines)
### END of function
def xapian_get_ranked_index(index, pattern, params, hitset, ranked_result_amount):
    """
    Queries a Xapian index.
    Returns: a list of ranked record ids [(recid, score), ...) contained in hitset
    and an intbitset of record ids contained in hitset.
    """
    result = []
    matched_recs = intbitset()
    database = DATABASES[index]
    enquire = xapian.Enquire(database)
    qp = xapian.QueryParser()
    stemmer = xapian.Stem("english")
    qp.set_stemmer(stemmer)
    qp.set_database(database)
    qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    # Avoids phrase search to increase performance
    if "avoid_phrase_search_threshold" in params and len(hitset) >= params[
            "avoid_phrase_search_threshold"] and pattern.startswith('"'):
        # Large hitset and a quoted phrase: strip the surrounding quotes
        # and AND the words together instead of a (slower) phrase query.
        pattern = pattern[1:-1]
        query_string = ' AND '.join(pattern.split(' '))
        pattern = qp.parse_query(query_string)
    else:
        query_string = pattern
        pattern = qp.parse_query(query_string, xapian.QueryParser.FLAG_PHRASE)
    enquire.set_query(pattern)
    # The MatchDecider restricts returned matches to records in hitset.
    matches = enquire.get_mset(0, ranked_result_amount, None,
                               MatchDecider(hitset))
    weight = params["weight"]
    for match in matches:
        recid = match.docid
        if recid in hitset:
            # match.percent is a 0-100 relevance; scale by the index weight.
            score = int(match.percent) * weight
            result.append((recid, score))
            matched_recs.add(recid)
    return (result, matched_recs)
def search(query, active_element, numresults=10):
    """Boolean field search; `query` maps field names to space-separated
    value strings.  Returns the decoded data of the top matches."""
    # Put active_element at end of query
    ordered_fields = sorted(query, key=lambda k: k == active_element)
    # XXX There should be a way to do this without going through an
    # intermediate string, and without adding prefixes.
    clauses = []
    for fname in ordered_fields:
        for token in query[fname].split():
            clauses.append('%s:%s' % (fname, token))
    querystring = ' AND '.join(clauses)
    db = xapian.Database(dbpath)
    queryparser = xapian.QueryParser()
    queryparser.set_database(db)
    for field, abbrev in fields.items():
        queryparser.add_prefix(field, abbrev)
    parsed = queryparser.parse_query(
        querystring,
        queryparser.FLAG_BOOLEAN | queryparser.FLAG_PARTIAL |
        queryparser.FLAG_WILDCARD)
    enquire = xapian.Enquire(db)
    # Pure boolean retrieval: no relevance weighting.
    enquire.set_weighting_scheme(xapian.BoolWeight())
    enquire.set_query(parsed)
    return [json.loads(hit.document.get_data())
            for hit in enquire.get_mset(0, numresults)]
def query_parser(self, database=None):
    """Return a xapian.QueryParser object set up for this fieldmap.

    `database` should be supplied if you want to use spelling
    correction etc.
    """
    parser = xapian.QueryParser()
    if database:
        parser.set_database(database)
    if self.language:
        parser.set_stemmer(xapian.Stem(self.language))
    # add fieldname prefixes: filter fields get boolean prefixes,
    # everything else a probabilistic prefix.
    for name, (prefix, valnum, isfilter) in self._fieldmap.iteritems():
        if isfilter:
            parser.add_boolean_prefix(name, prefix)
        else:
            parser.add_prefix(name, prefix)
    return parser
def process(self, queryString, content):
    """Return `content` — or query-relevant extracts of it — with the
    query terms optionally highlighted via self.highlightTemplate."""
    # Parse query
    queryParser = xapian.QueryParser()
    queryParser.set_stemmer(xapian.Stem('english'))
    queryParser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    query = queryParser.parse_query(queryString)
    # Parse content after replacing non-alphanumeric characters with spaces
    queryParser.parse_query(re.sub('\W', ' ', content).lower())
    # Create search pattern: map each stemmed query term back to the
    # unstemmed forms recorded while parsing the content above.
    documentTerms = sum(
        [list(queryParser.unstemlist(x)) for x in set(query)], [])
    if not documentTerms:
        # no unstemmed forms found; fall back to the raw query terms
        documentTerms = set(query)
    pattern = re.compile(
        r'\b(%s)\b' % '|'.join(re.escape(x) for x in documentTerms),
        re.IGNORECASE)
    # If the user does not want to extract text or there is no queryString,
    if not self.extractLength or not queryString:
        extract = content
    # If the user wants to extract text and there is a queryString,
    else:
        # Initialize
        extractIntervals = []
        extractLengthHalved = self.extractLength / 2
        # For each matchInterval,
        for match in pattern.finditer(content):
            # Prepare: half the extract length of context on each side
            # of the match, clipped to the content bounds.
            mStart = max(0, match.start() - extractLengthHalved)
            mEnd = min(len(content), match.end() + extractLengthHalved)
            # Absorb it (merges overlapping intervals in extractIntervals)
            absorbInterval((mStart, mEnd), extractIntervals)
        # Load extracts
        extract = self.joinText.join(content[eStart:eEnd].strip()
                                     for eStart, eEnd in extractIntervals)
    # If the user wants to highlight relevant terms and there is a queryString,
    if self.highlightTemplate and queryString:
        extract = pattern.sub(self.highlightTemplate % r'\1', extract)
    # Return
    return extract
def _create_query_for_string(self, text, require_all=True, analyzer=None):
    """Parse a plain string into the corresponding query object.

    @param text: the query string
    @type text: str
    @param require_all: boolean operator
        (True -> AND (default) / False -> OR)
    @type require_all: bool
    @param analyzer: Define query options (partial matching, exact
        matching, tokenizing, ...) as bitwise combinations of
        CommonIndexer.ANALYZER_???.  This can override previously
        defined field analyzer settings.  If analyzer is None
        (default), then the configured analyzer for the field is used.
    @type analyzer: int
    @return: resulting query object
    @rtype: xapian.Query
    """
    parser = xapian.QueryParser()
    parser.set_database(self.reader)
    parser.set_default_op(
        xapian.Query.OP_AND if require_all else xapian.Query.OP_OR)
    if analyzer is None:
        analyzer = self.analyzer
    # Check order matters: the partial bit wins over an exact match.
    if analyzer & self.ANALYZER_PARTIAL > 0:
        return parser.parse_query(text, xapian.QueryParser.FLAG_PARTIAL)
    elif analyzer == self.ANALYZER_EXACT:
        # exact matching - bypass the parser entirely
        return xapian.Query(text)
    else:
        # everything else (not partial and not exact)
        return parser.parse_query(text, 0)
def _create_query_for_field(self, field, value, analyzer=None):
    """Build a field->value query.

    :param field: the fieldname to be used
    :type field: str
    :param value: the wanted value of the field
    :type value: str
    :param analyzer: Define query options (partial matching, exact
        matching, tokenizing, ...) as bitwise combinations of
        *CommonIndexer.ANALYZER_???*.  This can override previously
        defined field analyzer settings.  If analyzer is None
        (default), then the configured analyzer for the field is used.
    :type analyzer: int
    :return: the resulting query object
    :rtype: xapian.Query
    """
    if analyzer is None:
        analyzer = self.analyzer
    if analyzer == self.ANALYZER_EXACT:
        # exact matching -> keep special characters
        return xapian.Query("%s%s" % (field.upper(), value))
    # other queries need a parser object
    parser = xapian.QueryParser()
    parser.set_database(self.reader)
    is_partial = analyzer & self.ANALYZER_PARTIAL > 0
    # partial matching, or everything else (not partial and not exact)
    flags = xapian.QueryParser.FLAG_PARTIAL if is_partial else 0
    return parser.parse_query(value, flags, field.upper())
def search(dbpath, querystring, offset=0, pagesize=10):
    """Search the index at `dbpath`, printing one line per match.

    offset - defines starting point within result set
    pagesize - defines number of records to retrieve
    """
    # Open the database we're going to search.
    db = xapian.Database(dbpath)
    # Set up a QueryParser with a stemmer and suitable prefixes
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    # Start of prefix configuration.
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")
    # End of prefix configuration.
    # And parse the query
    parsed = parser.parse_query(querystring)
    # Use an Enquire object on the database to run the query
    enquire = xapian.Enquire(db)
    enquire.set_query(parsed)
    # And print out something about each match
    matches = []
    for match in enquire.get_mset(offset, pagesize):
        fields = json.loads(match.document.get_data())
        print(u"%(rank)i: #%(docid)3.3i %(title)s" % {
            'rank': match.rank + 1,
            'docid': match.docid,
            'title': fields.get('TITLE', u''),
        })
        matches.append(match.docid)
    # Finally, make sure we log the query and displayed results
    # support.log_matches(querystring, offset, pagesize, matches)
    print(querystring, offset, pagesize, matches)
def find(database, query, sortfield=None, reverse=False, maxnum=100):
    """sort_direction is False for ascending order, True for descending
    order.
    """
    # Build a QueryParser with a stemmer and the configured prefixes.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    for tp in iota.TERMPREFIXES:
        parser.add_prefix(*tp)
    xquery = parser.parse_query(query)
    enquire = xapian.Enquire(database)
    enquire.set_query(xquery)
    # Determine how we're going to sort the results; an unknown sort
    # field silently falls back to relevance ordering.
    if sortfield is not None:
        try:
            slot = iota.SLOTS[sortfield]
        except KeyError:
            pass
        else:
            enquire.set_sort_by_value_then_relevance(slot, not reverse)
    logging.info("execute query: {}".format(query))
    mset = enquire.get_mset(0, maxnum)
    # make sexps, one per match plus a trailing found-count record
    sexps = [match_as_sexp(m) for m in mset]
    sexps.append('(:found {})\n'.format(len(mset)))
    return sexps
def searchDB(queryStr, withContent=False, extractLength=32):
    """Query the module-level `database`, print the first five matches
    with a highlighted extract, and return their docIDs.

    NOTE(review): `withContent` is currently unused — confirm intent.
    """
    # Parse Query
    queryParser = xapian.QueryParser()
    queryParser.set_stemmer(xapian.Stem('english'))
    queryParser.set_database(database)
    queryParser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    query = queryParser.parse_query(queryStr)
    offset, limit = 0, 5
    # Start Query Session
    enquire = xapian.Enquire(database)
    enquire.set_query(query)
    docIds = list()
    # Display Matches
    matches = enquire.get_mset(offset, limit)
    print('*' * 50)
    for match in matches:
        print('-' * 50)
        # presumably value slot 0 holds a PubMed id — verify with indexer
        pmId = match.document.get_value(0)
        docIds.append(match.document.get_value(0))
        print 'Rank/ID: %s, docID: %s' % (match.rank, pmId)
        print('-' * 50)
        # Process
        content = match.document.get_data()
        extract = TextMachine(extractLength, '*%s*').process(queryStr, content)
        print extract.replace('\n', ' ')
        print('-' * 50)
    print('*' * 50)
    print 'No. of Docs matching Query: %s' % matches.get_matches_estimated()
    print 'No. of Docs Returned: %s' % matches.size()
    return docIds
def test_value_range_processor():
    """Exercise MultipleValueRangeProcessor directly and via a parser."""
    vp = MultipleValueRangeProcessor(dict(foo=1, bar=2), str.upper)
    # Known prefixes map to their value slots, prefix stripped and both
    # range ends transformed by str.upper.
    assert vp('foo:abc', 'def') == (1, 'ABC', 'DEF')
    assert vp('bar:news', 'def') == (2, 'NEWS', 'DEF')
    assert vp('bar:', 'def') == (2, '', 'DEF')
    # Missing colon or unknown prefix: BAD_VALUENO, strings untouched.
    assert vp('bar', 'def') == (xapian.BAD_VALUENO, 'bar', 'def')
    assert vp('baz:foo', 'def') == (xapian.BAD_VALUENO, 'baz:foo', 'def')
    qp = xapian.QueryParser()
    db = xodb.temp()
    qp.set_database(db.backend)
    qp.add_valuerangeprocessor(vp)
    for raw, expected in (
            ('foo:abc..def', 'Xapian::Query(VALUE_RANGE 1 ABC DEF)'),
            ('bar:abc..def', 'Xapian::Query(VALUE_RANGE 2 ABC DEF)'),
            ('bar:3..4', 'Xapian::Query(VALUE_RANGE 2 3 4)')):
        assert str(qp.parse_query(raw)) == expected
    # An unknown prefix in a range makes parsing fail outright.
    assert_raises(xapian.QueryParserError, qp.parse_query, 'baz:abc..def')
def handle_query(self, q):
    """Run `q` against the index and return (path, title, context)
    tuples for the top 100 matches."""
    db = xapian.Database(self.db_path)
    searcher = xapian.Enquire(db)
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("english"))
    parser.set_database(db)
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    parsed = parser.parse_query(q)
    # Find the top 100 results for the query.
    searcher.set_query(parsed)
    matches = searcher.get_mset(0, 100)
    results = []
    for m in matches:
        context = self.extract_context(m.document.get_data())
        results.append((m.document.get_value(self.DOC_PATH),
                        m.document.get_value(self.DOC_TITLE),
                        ''.join(context)))
    return results
def search(self, server_guid, store_guid, folder_ids, fields_terms, query, limit_results, log):
    """ handle query; see links in the top for a description of the Xapian API

    NOTE(review): `folder_ids` is currently unused here — confirm intent.
    """
    db = self.open_db(server_guid, store_guid, log=log)
    if not db:
        # no index for this store yet
        return []
    qp = xapian.QueryParser()
    # prefixes matching those used at indexing time
    qp.add_prefix("sourcekey", "XK:")
    qp.add_prefix("folderid", "XF:")
    # one prefix per searched MAPI property id
    for fields, terms in fields_terms:
        for field in fields:
            qp.add_prefix('mapi%d' % field, "XM%d:" % field)
    log.info('performing query: %s', query)
    qp.set_database(db)
    query = qp.parse_query(query, xapian.QueryParser.FLAG_BOOLEAN|xapian.QueryParser.FLAG_PHRASE|xapian.QueryParser.FLAG_WILDCARD)
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    matches = []
    # value slot 0 presumably holds the document's sourcekey — confirm
    # against the indexer
    for match in enquire.get_mset(0, limit_results or db.get_doccount()):  # XXX catch exception if database is being updated?
        matches.append(match.document.get_value(0).decode('ascii'))
    db.close()
    return matches
def get_latest_builds(self, package_name):
    """Look up `package_name` in the versionmap index and return an
    OrderedDict of distname -> build info, or None when not found."""
    enquire = xapian.Enquire(self._versionmap_db)
    qp = xapian.QueryParser()
    qp.set_database(self._versionmap_db)
    # 'key' is a boolean prefix (XA) holding the filtered package name.
    qp.add_boolean_prefix('key', 'XA')
    query = qp.parse_query('key:%s' % utils.filter_search_string(package_name))
    enquire.set_query(query)
    # Only the best match is needed.
    matches = enquire.get_mset(0, 1)
    if len(matches) == 0:
        return None
    results = json.loads(matches[0].document.get_data())
    latest_builds = OrderedDict()
    lastdistname = ""
    # Walk the known dist tags in order, keeping the first recorded
    # build for each distinct dist name.
    # NOTE(review): lastdistname only advances for dists present in
    # `results` — confirm that is the intended dedup behavior.
    for dist in distmappings.tags:
        distname = dist['name']
        if lastdistname != distname and distname in results:
            latest_builds[distname] = results[distname]
            lastdistname = distname
    return latest_builds
def search(queryString, byDate=False, ownerID=None, extractLength=32): # Parse query string queryParser = xapian.QueryParser() queryParser.set_stemmer(xapian.Stem('english')) queryParser.set_database(database) queryParser.set_stemming_strategy(xapian.QueryParser.STEM_SOME) query = queryParser.parse_query(queryString) # Set offset and limit for pagination offset, limit = 0, database.get_doccount() # Start query session enquire = xapian.Enquire(database) enquire.set_query(query) # Sort by date if byDate: enquire.set_sort_by_value(xapian_when) if ownerID == None: matches = enquire.get_mset(offset, limit) else: # Filter by ownerID matches = enquire.get_mset(offset, limit, None, MatchDecider(ownerID)) # Display matches for match in matches: # Load documentWhen = match.document.get_value(xapian_when) # documentOwnerID = int(xapian.sortable_unserialise(match.document.get_value(xapian_owner_id))) # Display print '===================' print 'rank=%s, documentID=%s' % (match.rank, match.docid) # Process content = match.document.get_data() extract = TextMachine(extractLength, '*%s*').process(queryString, content) print extract.replace('\n', ' ') print '===================' print 'Number of documents matching query: %s' % matches.get_matches_estimated( ) print 'Number of documents returned: %s' % matches.size()
def crossquery(querystring, lgs): url = 'http://www.galoes.org/grammars/%s/%s' width = 0 #querystring = sys.argv[1] d = '/var/wiki' #lgs = glob.glob('/var/wiki/*') results = [] for lg in lgs: qp = xapian.QueryParser() try: database = xapian.Database('/var/wiki/xapian/%s/index' % lg) except: #print "no index found for ", lg continue qp.set_database(database) msc = MoinSearchConnection(database) qresults = msc.get_all_documents(query=qp.parse_query(querystring)) for r in qresults: wikiname = r.data['title'][0] wikinamefs = wikiname.replace(' ', '(20)').replace(',', '(2c)') try: revnr = open('%s/%s/pasges/%s/current' % (d, lg, wikinamefs)).read().strip() content = open('%s/%s/pages/%s/revisions/%s' % (d, lg, wikinamefs, revnr)).read() except IOError: print "File not Found", wikinamefs continue matches = re.findall( '%s%s%s' % (width * '.', querystring.lower(), width * '.'), content.lower()) results.append({ 'link': url % (lg, wikinamefs), 'matchcount': len(matches), 'matches': matches, }) return result
def search(xap_db_path, querystring, offset=0, pagesize=10):
    """Search the Swedish food index, returning each match's document
    data decoded as unicode.

    offset - defines starting point within result set
    pagesize - defines number of records to retrieve
    """
    # Open the database we're going to search.
    db = xapian.Database(xap_db_path)
    # Set up a QueryParser with a stemmer and suitable prefixes
    queryparser = xapian.QueryParser()
    queryparser.set_stemmer(xapian.Stem("swedish"))
    queryparser.set_stemming_strategy(queryparser.STEM_SOME)
    queryparser.add_prefix("food", "XFOOD")
    # BUG FIX: the last flag was xapian.Query.OP_PHRASE — a query
    # operator constant, not a QueryParser flag; FLAG_PHRASE was meant.
    xapian_flags = (xapian.QueryParser.FLAG_WILDCARD |
                    xapian.QueryParser.FLAG_BOOLEAN |
                    xapian.QueryParser.FLAG_SYNONYM |
                    xapian.QueryParser.FLAG_BOOLEAN_ANY_CASE |
                    xapian.QueryParser.FLAG_LOVEHATE |
                    xapian.QueryParser.FLAG_PHRASE)
    # And parse the query
    queryparser.set_database(db)
    query = queryparser.parse_query(querystring, xapian_flags)
    # Use an Enquire object on the database to run the query
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    matches = []
    for match in enquire.get_mset(offset, pagesize):
        s_fields = match.document.get_data()
        matches.append(unicode(s_fields, 'utf-8'))
    return matches
def get_full_text_matches(database, full_text_query, offset, limit, match_decider):
    u"""Does the actual full-text search with Xapian.

    :Parameters:
      - `database`: the name of the RefDB database
      - `full_text_query`: the raw query string for the full text search;
        must not be empty
      - `offset`: offset of the returned hits within the complete hits list
      - `limit`: maximal number of returned hits
      - `match_decider`: Xapian match decider object, e.g. for taking the
        other search parameters into account

    :type database: unicode
    :type full_text_query: unicode
    :type offset: int
    :type limit: int
    :type match_decider: `MatchDecider`

    :Return:
      the found matches

    :rtype: ``Xapian.MSet``
    """
    # `database` is rebound from the index *name* to the opened index.
    database = xapian.Database(
        os.path.join("/var/lib/django_refdb_indices", database))
    enquire = xapian.Enquire(database)
    # enquire.set_collapse_key(0)
    query_parser = xapian.QueryParser()
    stemmer = xapian.Stem("english")
    query_parser.set_stemmer(stemmer)
    query_parser.set_database(database)
    query_parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    query = query_parser.parse_query(full_text_query)
    enquire.set_query(query)
    # The match decider filters hits according to the other search params.
    return enquire.get_mset(offset, limit, None, match_decider)
def crossquery(querystring, lg): url = 'http://www.galoes.org/grammars/%s/%s' width = 20 d = '/var/wiki' results = [] qp = xapian.QueryParser() database = xapian.Database('/var/wiki/xapian/%s/index' % lg) qp.set_database(database) msc = MoinSearchConnection(database) qresults = msc.get_all_documents(query=qp.parse_query(querystring)) for r in qresults: wikiname = r.data['title'][0] wikinamefs = wikiutil.quoteWikinameFS(wikiname) try: refv = '%s/%s/pages/%s/current' % (d, lg, wikinamefs) revnr = open(refv).read().strip() contentf = '%s/%s/pages/%s/revisions/%s' % (d, lg, wikinamefs, revnr) content = open(contentf).read().decode('utf8') except IOError: print "File not Found", wikinamefs continue matches = re.findall( u'%s%s%s' % ('.' + '{,%i}' % width, querystring.lower(), '.' + '{,%i}' % width), content.lower()) print matches results.append({ 'link': url % (lg, wikinamefs), 'totalmatches': len(matches), 'matches': matches, 'name': wikiname, }) return results
def test_userstem():
    """A user-supplied stemmer must work wherever a Stem is accepted."""
    mystem = MyStemmer()
    stem = xapian.Stem(mystem)
    expect(stem(b'test'), b'tst')
    stem2 = xapian.Stem(mystem)
    expect(stem2(b'toastie'), b'tst')
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem(MyStemmer()))
    doc = xapian.Document()
    indexer.set_document(doc)
    indexer.index_text(b'hello world')
    # Collect every indexed term: stemmed (Z-prefixed) then unstemmed.
    joined = '/' + ''.join(t.term.decode('utf-8') + '/'
                           for t in doc.termlist())
    expect(joined, '/Zhll/Zwrld/hello/world/')
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem(MyStemmer()))
    parser.set_stemming_strategy(xapian.QueryParser.STEM_ALL)
    expect_query(parser.parse_query(b'color television'),
                 '(clr@1 OR tlvsn@2)')
from softwarecenter.enums import XapianValues
import softwarecenter.paths

# this is not a test as such, more a example of how xapian search
# work and useful features around them

if __name__ == "__main__":
    # take the search term from the command line, defaulting to "app"
    if len(sys.argv) > 1:
        search_term = sys.argv[1]
    else:
        search_term = "app"
    db = xapian.Database(softwarecenter.paths.XAPIAN_PATH)
    parser = xapian.QueryParser()
    #parser.set_stemmer(xapian.Stem("english"))
    #parser.set_stemming_strategy(xapian.QueryParser.STEM_ALL)
    parser.set_database(db)
    #parser.add_prefix("pkg", "AP")
    # partial + wildcard matching for as-you-type style searches
    query = parser.parse_query(
        search_term,
        xapian.QueryParser.FLAG_PARTIAL | xapian.QueryParser.FLAG_WILDCARD)
    enquire = xapian.Enquire(db)
    # order by the popcon (popularity) value slot, then relevance
    enquire.set_sort_by_value_then_relevance(XapianValues.POPCON)
    enquire.set_query(query)
    matches = enquire.get_mset(0, db.get_doccount())
    print "Matches:"
    for m in matches:
        doc = m.document
def get_query_from_search_entry(search_term):
    """Turn the search-entry text into a xapian.Query; an empty term
    yields a match-nothing query."""
    if not search_term:
        return xapian.Query("")
    return xapian.QueryParser().parse_query(search_term)
# Collect remaining command-line words into the query string until the
# '--' separator; everything after it is treated as relevant docids.
while index < len(sys.argv):
    arg = sys.argv[index]
    index += 1
    if arg == '--':
        # Passed marker, move to parsing relevant docids.
        break
    query_string += ' '
    query_string += arg

# Create an RSet with the listed docids in.
reldocs = xapian.RSet()
for index in range(index, len(sys.argv)):
    reldocs.add_document(int(sys.argv[index]))

# Parse the query string to produce a Xapian::Query object.
qp = xapian.QueryParser()
stemmer = xapian.Stem("english")
qp.set_stemmer(stemmer)
qp.set_database(database)
qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
query = qp.parse_query(query_string)
# NOTE(review): original indentation was lost; assuming only the echo
# below is guarded by the empty-query check — confirm against upstream.
if not query.empty():
    print("Parsed query is: %s" % str(query))

# Find the top 10 results for the query, using reldocs for relevance
# feedback.
enquire.set_query(query)
matches = enquire.get_mset(0, 10, reldocs)

# Display the results.
print("%i results found." % matches.get_matches_estimated())
def test_all(): # Test the version number reporting functions give plausible results. v = "%d.%d.%d" % (xapian.major_version(), xapian.minor_version(), xapian.revision()) v2 = xapian.version_string() expect(v2, v, "Unexpected version output") # A regexp check would be better, but seems to create a bogus "leak" of -1 # objects in Python 3. expect(len(xapian.__version__.split('.')), 3, 'xapian.__version__ not X.Y.Z') expect((xapian.__version__.split('.'))[0], '1', 'xapian.__version__ not "1.Y.Z"') def access_cvar(): res = xapian.cvar print("Unhandled constants: ", res) return res # Check that SWIG isn't generating cvar (regression test for ticket#297). # # Python 3.5 generates a different exception message here to earlier # versions, so we need a check which matches both. expect_exception(AttributeError, lambda msg: msg.find("has no attribute 'cvar'") != -1, access_cvar) stem = xapian.Stem(b"english") expect(str(stem), "Xapian::Stem(english)", "Unexpected str(stem)") doc = xapian.Document() doc.set_data(b"a\0b") if doc.get_data() == b"a": raise TestFail("get_data+set_data truncates at a zero byte") expect(doc.get_data(), b"a\0b", "get_data+set_data doesn't transparently handle a zero byte") doc.set_data(b"is there anybody out there?") doc.add_term(b"XYzzy") doc.add_posting(stem(b"is"), 1) doc.add_posting(stem(b"there"), 2) doc.add_posting(stem(b"anybody"), 3) doc.add_posting(stem(b"out"), 4) doc.add_posting(stem(b"there"), 5) db = xapian.WritableDatabase('', xapian.DB_BACKEND_INMEMORY) db.add_document(doc) expect(db.get_doccount(), 1, "Unexpected db.get_doccount()") terms = ["smoke", "test", "terms"] expect_query( xapian.Query(xapian.Query.OP_OR, [t.encode('utf-8') for t in terms]), "(smoke OR test OR terms)") query1 = xapian.Query(xapian.Query.OP_PHRASE, (b"smoke", b"test", b"tuple")) query2 = xapian.Query(xapian.Query.OP_XOR, (xapian.Query(b"smoke"), query1, b"string")) expect_query(query1, "(smoke PHRASE 3 test PHRASE 3 tuple)") expect_query( query2, "(smoke XOR 
(smoke PHRASE 3 test PHRASE 3 tuple) XOR string)") subqs = ["a", "b"] expect_query( xapian.Query(xapian.Query.OP_OR, [s.encode('utf-8') for s in subqs]), "(a OR b)") expect_query(xapian.Query(xapian.Query.OP_VALUE_RANGE, 0, b'1', b'4'), "VALUE_RANGE 0 1 4") # Check database factory functions are wrapped as expected (or not wrapped # in the first cases): expect_exception( AttributeError, lambda msg: msg.find("has no attribute 'open_stub'") != -1, lambda: xapian.open_stub(b"nosuchdir/nosuchdb")) expect_exception( AttributeError, lambda msg: msg.find("has no attribute 'open_stub'") != -1, lambda: xapian.open_stub(b"nosuchdir/nosuchdb", xapian.DB_OPEN)) expect_exception( xapian.DatabaseOpeningError, None, lambda: xapian.Database(b"nosuchdir/nosuchdb", xapian.DB_BACKEND_STUB)) expect_exception( xapian.DatabaseOpeningError, None, lambda: xapian.WritableDatabase( b"nosuchdir/nosuchdb", xapian.DB_OPEN | xapian.DB_BACKEND_STUB)) expect_exception( xapian.DatabaseOpeningError, None, lambda: xapian.Database( b"nosuchdir/nosuchdb", xapian.DB_BACKEND_GLASS)) expect_exception( xapian.DatabaseCreateError, None, lambda: xapian.WritableDatabase( b"nosuchdir/nosuchdb", xapian.DB_CREATE | xapian.DB_BACKEND_GLASS)) expect_exception( xapian.FeatureUnavailableError, None, lambda: xapian.Database( b"nosuchdir/nosuchdb", xapian.DB_BACKEND_CHERT)) expect_exception( xapian.FeatureUnavailableError, None, lambda: xapian.WritableDatabase( b"nosuchdir/nosuchdb", xapian.DB_CREATE | xapian.DB_BACKEND_CHERT)) expect_exception(xapian.NetworkError, None, xapian.remote_open, b"/bin/false", b"") expect_exception(xapian.NetworkError, None, xapian.remote_open_writable, b"/bin/false", b"") expect_exception(xapian.NetworkError, None, xapian.remote_open, b"127.0.0.1", 0, 1) expect_exception(xapian.NetworkError, None, xapian.remote_open_writable, b"127.0.0.1", 0, 1) # Check wrapping of MatchAll and MatchNothing: expect_query(xapian.Query.MatchAll, "<alldocuments>") expect_query(xapian.Query.MatchNothing, "") 
# Feature test for Query.__iter__ term_count = 0 for term in query2: term_count += 1 expect(term_count, 4, "Unexpected number of terms in query2") enq = xapian.Enquire(db) enq.set_query(xapian.Query(xapian.Query.OP_OR, b"there", b"is")) mset = enq.get_mset(0, 10) expect(mset.size(), 1, "Unexpected mset.size()") expect(len(mset), 1, "Unexpected mset.size()") # Feature test for Enquire.matching_terms(docid) term_count = 0 for term in enq.matching_terms(mset.get_hit(0)): term_count += 1 expect(term_count, 2, "Unexpected number of matching terms") # Feature test for MSet.__iter__ msize = 0 for match in mset: msize += 1 expect(msize, mset.size(), "Unexpected number of entries in mset") terms = b" ".join(enq.matching_terms(mset.get_hit(0))) expect(terms, b"is there", "Unexpected terms") # Feature test for ESet.__iter__ rset = xapian.RSet() rset.add_document(1) eset = enq.get_eset(10, rset) term_count = 0 for term in eset: term_count += 1 expect(term_count, 3, "Unexpected number of expand terms") # Feature test for Database.__iter__ term_count = 0 for term in db: term_count += 1 expect(term_count, 5, "Unexpected number of terms in db") # Feature test for Database.allterms term_count = 0 for term in db.allterms(): term_count += 1 expect(term_count, 5, "Unexpected number of terms in db.allterms") # Feature test for Database.postlist count = 0 for posting in db.postlist(b"there"): count += 1 expect(count, 1, "Unexpected number of entries in db.postlist('there')") # Feature test for Database.postlist with empty term (alldocspostlist) count = 0 for posting in db.postlist(b""): count += 1 expect(count, 1, "Unexpected number of entries in db.postlist('')") # Feature test for Database.termlist count = 0 for term in db.termlist(1): count += 1 expect(count, 5, "Unexpected number of entries in db.termlist(1)") # Feature test for Database.positionlist count = 0 for term in db.positionlist(1, b"there"): count += 1 expect(count, 2, "Unexpected number of entries in db.positionlist(1, 
'there')") # Feature test for Document.termlist count = 0 for term in doc.termlist(): count += 1 expect(count, 5, "Unexpected number of entries in doc.termlist()") # Feature test for TermIter.skip_to term = doc.termlist() term.skip_to(b'n') while True: try: x = next(term) except StopIteration: break if x.term < b'n': raise TestFail("TermIter.skip_to didn't skip term '%s'" % x.term.decode('utf-8')) # Feature test for Document.values count = 0 for term in list(doc.values()): count += 1 expect(count, 0, "Unexpected number of entries in doc.values") # Check exception handling for Xapian::DocNotFoundError expect_exception(xapian.DocNotFoundError, "Docid 3 not found", db.get_document, 3) # Check value of OP_ELITE_SET expect(xapian.Query.OP_ELITE_SET, 10, "Unexpected value for OP_ELITE_SET") # Feature test for MatchDecider doc = xapian.Document() doc.set_data(b"Two") doc.add_posting(stem(b"out"), 1) doc.add_posting(stem(b"outside"), 1) doc.add_posting(stem(b"source"), 2) doc.add_value(0, b"yes") db.add_document(doc) class testmatchdecider(xapian.MatchDecider): def __call__(self, doc): return doc.get_value(0) == b"yes" query = xapian.Query(stem(b"out")) enquire = xapian.Enquire(db) enquire.set_query(query) mset = enquire.get_mset(0, 10, None, testmatchdecider()) expect(mset.size(), 1, "Unexpected number of documents returned by match decider") expect(mset.get_docid(0), 2, "MatchDecider mset has wrong docid in") # Feature test for ExpandDecider class testexpanddecider(xapian.ExpandDecider): def __call__(self, term): return (not term.startswith(b'a')) enquire = xapian.Enquire(db) rset = xapian.RSet() rset.add_document(1) eset = enquire.get_eset(10, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0, testexpanddecider()) eset_terms = [item.term for item in eset] expect(len(eset_terms), eset.size(), "Unexpected number of terms returned by expand") if [t for t in eset_terms if t.startswith(b'a')]: raise TestFail("ExpandDecider was not used") # Check min_wt argument to get_eset() 
works (new in 1.2.5). eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ) expect([i.weight for i in eset][-1] < 1.9, True, "test get_eset() without min_wt") eset = enquire.get_eset(100, rset, xapian.Enquire.USE_EXACT_TERMFREQ, 1.0, None, 1.9) expect([i.weight for i in eset][-1] >= 1.9, True, "test get_eset() min_wt") # Check QueryParser parsing error. qp = xapian.QueryParser() expect_exception(xapian.QueryParserError, "Syntax: <expression> AND <expression>", qp.parse_query, b"test AND") # Check QueryParser pure NOT option qp = xapian.QueryParser() expect_query( qp.parse_query(b"NOT test", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT), "(<alldocuments> AND_NOT test@1)") # Check QueryParser partial option qp = xapian.QueryParser() qp.set_database(db) qp.set_default_op(xapian.Query.OP_AND) qp.set_stemming_strategy(qp.STEM_SOME) qp.set_stemmer(xapian.Stem(b'en')) expect_query(qp.parse_query(b"foo o", qp.FLAG_PARTIAL), "(Zfoo@1 AND ((SYNONYM WILDCARD OR o) OR Zo@2))") expect_query(qp.parse_query(b"foo outside", qp.FLAG_PARTIAL), "(Zfoo@1 AND ((SYNONYM WILDCARD OR outside) OR Zoutsid@2))") # Test supplying unicode strings expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar')), '(foo OR bar)') expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xa3')), '(foo OR bar\\xa3)') expect_query(xapian.Query(xapian.Query.OP_OR, (b'foo', b'bar\xc2\xa3')), '(foo OR bar\u00a3)') expect_query(xapian.Query(xapian.Query.OP_OR, b'foo', b'bar'), '(foo OR bar)') expect_query( qp.parse_query(b"NOT t\xe9st", qp.FLAG_BOOLEAN + qp.FLAG_PURE_NOT), "(<alldocuments> AND_NOT Zt\u00e9st@1)") doc = xapian.Document() doc.set_data(b"Unicode with an acc\xe9nt") doc.add_posting(stem(b"out\xe9r"), 1) expect(doc.get_data(), b"Unicode with an acc\xe9nt") term = next(doc.termlist()).term expect(term, b"out\xe9r") # Check simple stopper stop = xapian.SimpleStopper() qp.set_stopper(stop) expect(stop(b'a'), False) expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN), "(Zfoo@1 AND 
Zbar@2 AND Za@3)") stop.add(b'a') expect(stop(b'a'), True) expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN), "(Zfoo@1 AND Zbar@2)") # Feature test for custom Stopper class my_b_stopper(xapian.Stopper): def __call__(self, term): return term == b"b" def get_description(self): return "my_b_stopper" stop = my_b_stopper() expect(stop.get_description(), "my_b_stopper") qp.set_stopper(stop) expect(stop(b'a'), False) expect_query(qp.parse_query(b"foo bar a", qp.FLAG_BOOLEAN), "(Zfoo@1 AND Zbar@2 AND Za@3)") expect(stop(b'b'), True) expect_query(qp.parse_query(b"foo bar b", qp.FLAG_BOOLEAN), "(Zfoo@1 AND Zbar@2)") # Test TermGenerator termgen = xapian.TermGenerator() doc = xapian.Document() termgen.set_document(doc) termgen.index_text(b'foo bar baz foo') expect([(item.term, item.wdf, [pos for pos in item.positer]) for item in doc.termlist()], [(b'bar', 1, [2]), (b'baz', 1, [3]), (b'foo', 2, [1, 4])]) # Check DateValueRangeProcessor works context("checking that DateValueRangeProcessor works") qp = xapian.QueryParser() vrpdate = xapian.DateValueRangeProcessor(1, 1, 1960) qp.add_valuerangeprocessor(vrpdate) query = qp.parse_query(b'12/03/99..12/04/01') expect(str(query), 'Query(VALUE_RANGE 1 19991203 20011204)') # Regression test for bug#193, fixed in 1.0.3. 
context("running regression test for bug#193") vrp = xapian.NumberValueRangeProcessor(0, b'$', True) a = '$10' b = '20' slot, a, b = vrp(a, b.encode('utf-8')) expect(slot, 0) expect(xapian.sortable_unserialise(a), 10) expect(xapian.sortable_unserialise(b), 20) # Feature test for xapian.FieldProcessor context("running feature test for xapian.FieldProcessor") class testfieldprocessor(xapian.FieldProcessor): def __call__(self, s): if s == 'spam': raise Exception('already spam') return xapian.Query("spam") qp.add_prefix('spam', testfieldprocessor()) qp.add_boolean_prefix('boolspam', testfieldprocessor()) query = qp.parse_query('spam:ignored') expect(str(query), 'Query(spam)') # FIXME: This doesn't currently work: # expect_exception(Exception, 'already spam', qp.parse_query, 'spam:spam') # Regression tests copied from PHP (probably always worked in python, but # let's check...) context("running regression tests for issues which were found in PHP") # PHP overload resolution involving boolean types failed. enq.set_sort_by_value(1, True) # Regression test - fixed in 0.9.10.1. oqparser = xapian.QueryParser() oquery = oqparser.parse_query(b"I like tea") # Regression test for bug#192 - fixed in 1.0.3. enq.set_cutoff(100) # Test setting and getting metadata expect(db.get_metadata(b'Foo'), b'') db.set_metadata(b'Foo', b'Foo') expect(db.get_metadata(b'Foo'), b'Foo') expect_exception(xapian.InvalidArgumentError, "Empty metadata keys are invalid", db.get_metadata, b'') expect_exception(xapian.InvalidArgumentError, "Empty metadata keys are invalid", db.set_metadata, b'', b'Foo') expect_exception(xapian.InvalidArgumentError, "Empty metadata keys are invalid", db.get_metadata, b'') # Test OP_SCALE_WEIGHT and corresponding constructor expect_query( xapian.Query(xapian.Query.OP_SCALE_WEIGHT, xapian.Query(b'foo'), 5), "5 * foo")