def find_objects(collection, keywords, docs=False):
    """Search *collection* for *keywords*.

    Returns the match objects themselves when *docs* is true, otherwise
    each match's stored document data.
    """
    colpath = _get_path_to_collection(collection)
    if not _dir_exists(colpath):
        _create_path(colpath)
    # Open the collection read-only and prepare the search machinery.
    db = _open_collection(colpath, READ)
    enq = xapian.Enquire(db)
    # Query parser with Dutch stemming, bound to this database.
    parser = xapian.QueryParser()
    parser.set_database(db)
    parser.set_stemmer(xapian.Stem("dutch"))
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    query = parser.parse_query(keywords)
    log.debug("Searching collection %r with query: %s", collection, str(query))
    enq.set_query(query)
    matches = enq.get_mset(0, 10000)
    log.debug("Found approx %d results", matches.get_matches_estimated())
    # Hand back either the raw match objects or just their stored data.
    if docs:
        return list(matches)
    return [m.document.get_data() for m in matches]
def index(datapath, dbpath):
    """Index CSV records from *datapath* into a Xapian database at *dbpath*.

    BUG FIX: TITLE was fetched (and printed) but never indexed, even though
    the companion search code parses a "TITLE" prefix ('T').  It is now
    indexed both with the 'T' prefix and unprefixed for general search.
    The leftover debug prints of title/body were removed.
    """
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
    termgenerator = xapian.TermGenerator()
    termgenerator.set_stemmer(xapian.Stem("en"))
    for fields in parse_csv_file(datapath):
        title = fields.get('TITLE', u'')
        body = fields.get('BODY', u'')
        textClass = fields.get('CLASS', u'')
        identifier = fields.get('ID', u'')
        doc = xapian.Document()
        termgenerator.set_document(doc)
        # Prefixed terms for field-scoped queries.
        termgenerator.index_text(title, 1, 'T')
        termgenerator.index_text(textClass, 1, 'C')
        termgenerator.index_text(body, 1, 'B')
        termgenerator.index_text(identifier, 1, 'I')
        # Unprefixed terms for general search; the termpos bump stops
        # phrases matching across field boundaries.
        termgenerator.index_text(title)
        termgenerator.increase_termpos()
        termgenerator.index_text(textClass)
        termgenerator.increase_termpos()
        termgenerator.index_text(body)
        termgenerator.increase_termpos()
        termgenerator.index_text(identifier)
        # Store the whole record for display at search time.
        doc.set_data(json.dumps(fields, ensure_ascii=False, encoding="utf-8"))
        # A unique ID term makes re-indexing idempotent.
        idterm = u"Q" + identifier
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
def searchDoc(queryString):
    """Search the module-level `database` for *queryString*.

    Returns the stored data of at most 10 matching documents.

    BUG FIX: the old code fetched *every* document in the database
    (limit = doccount) and broke out of the loop only after appending an
    11th result (the `i > 9` check ran after the append).  Ask Xapian
    for at most 10 matches up front instead.
    """
    # Parse the query string with an English stemmer.
    queryParser = xapian.QueryParser()
    queryParser.set_stemmer(xapian.Stem('english'))
    queryParser.set_database(database)
    queryParser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    query = queryParser.parse_query(queryString)
    # Run the query and collect the top matches.
    enquire = xapian.Enquire(database)
    enquire.set_query(query)
    matches = enquire.get_mset(0, 10)
    return [match.document.get_data() for match in matches]
def __init__(self, path):
    """Set up the term generator and query parser for the index at *path*."""
    self.path = path
    # Indexing side: English stemmer with the project-wide strategy.
    self.term_generator = xapian.TermGenerator()
    self.term_generator.set_stemmer(xapian.Stem("en"))
    self.term_generator.set_stemming_strategy(STEMMING)
    # Query side: mirror the indexing configuration.
    self.query_parser = xapian.QueryParser()
    self.query_parser.set_stemmer(xapian.Stem("en"))
    self.query_parser.set_stemming_strategy(STEMMING)
    # Free-text prefixes (stemmed and unstemmed alike use add_prefix).
    for field, prefix in list(terms.STEMMED) + list(terms.PREFIXED_UNSTEMMED):
        self.query_parser.add_prefix(field, prefix)
    # Boolean (filter) prefixes.
    for field, prefix in terms.PREFIXED_UNSTEMMED_BOOLEAN:
        self.query_parser.add_boolean_prefix(field, prefix)
def create_index(self):
    """ Create a new index, and set up its field structure """
    log.warning("start create_index")
    # Open (or create) the on-disk database and attach an English stemmer.
    self.db = xapian.WritableDatabase(self.dbpath, xapian.DB_CREATE_OR_OPEN)
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))
    self.indexer = indexer
    log.warning("end create_index")
def _parseq(x_db, query, prefix=''):
    '''parse and return a QueryParser query'''
    # English-stemming parser bound to the given database.
    parser = _x.QueryParser()
    parser.set_database(x_db)
    parser.set_stemmer(_x.Stem("english"))
    parser.set_stemming_strategy(_x.QueryParser.STEM_SOME)
    # flags=0, with the caller-supplied default term prefix.
    return parser.parse_query(query, 0, prefix)
def __init__(self, language_code='en', stemmer=None):
    """Create a new highlighter for the specified language."""
    # An explicitly supplied stemmer wins; otherwise build one for the language.
    self.stem = stemmer if stemmer is not None else xapian.Stem(language_code)
def _parse_query(self, text):
    """Parse *text* into a xapian.Query, storing the parser and query on self."""
    database = self._index.index
    # English-stemming parser bound to the index's database.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("english"))
    parser.set_database(database)
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    self.parser = parser
    self.query = parser.parse_query(text, DEFAULT_SEARCH_FLAGS)
def index(datapath, dbpath):
    """Index US-state records from *datapath* into a Xapian db at *dbpath*.

    ROBUSTNESS FIX: every other field is read with .get(), but midlat and
    midlon used direct indexing and raised KeyError for records without
    coordinates; they now default to None and are simply skipped.
    """
    # Create or open the database we're going to be writing to.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
    # Set up a TermGenerator that we'll use in indexing.
    termgenerator = xapian.TermGenerator()
    termgenerator.set_stemmer(xapian.Stem("en"))
    for fields in parse_states(datapath):
        # 'fields' is a dictionary mapping from field name to value.
        name = fields.get('name', u'')
        description = fields.get('description', u'')
        motto = fields.get('motto', u'')
        admitted = fields.get('admitted', None)
        population = fields.get('population', None)
        order = fields.get('order', u'')
        doc = xapian.Document()
        termgenerator.set_document(doc)
        # Prefixed terms for field-scoped search.
        termgenerator.index_text(name, 1, 'S')
        termgenerator.index_text(description, 1, 'XD')
        termgenerator.index_text(motto, 1, 'XM')
        # Unprefixed terms for general search; increase_termpos() stops
        # phrases matching across field boundaries.
        termgenerator.index_text(name)
        termgenerator.increase_termpos()
        termgenerator.index_text(description)
        termgenerator.increase_termpos()
        termgenerator.index_text(motto)
        # Sortable document values.
        if admitted is not None:
            doc.add_value(1, xapian.sortable_serialise(int(admitted[:4])))
            doc.add_value(2, admitted)  # YYYYMMDD
        if population is not None:
            doc.add_value(3, xapian.sortable_serialise(int(population)))
        # Coordinates are optional; only add the value when both exist.
        midlat = fields.get('midlat')
        midlon = fields.get('midlon')
        if midlat and midlon:
            doc.add_value(4, "%f,%f" % (float(midlat), float(midlon)))
        # Store all the fields for display purposes.
        doc.set_data(json.dumps(fields))
        # The unique ID term keeps re-runs of the indexer idempotent.
        idterm = u"Q" + order
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
def query_parser(x_db):
    """Build a QueryParser bound to *x_db* with the standard field prefixes."""
    parser = _x.QueryParser()
    parser.set_stemmer(_x.Stem("english"))
    parser.set_database(x_db)
    parser.set_stemming_strategy(_x.QueryParser.STEM_SOME)
    # Register every configured field -> term-prefix mapping.
    for field, prefix in TERM_MAP.iteritems():
        parser.add_prefix(field, prefix)
    return parser
def hello_world(request): # Open the database for searching. database = xapian.Database( "/home/pb/Documents/python/django_test/mysite/firsttest/index") # Start an enquire session. enquire = xapian.Enquire(database) # Combine the rest of the command line arguments with spaces between # them, so that simple queries don't have to be quoted at the shell # level. #second parameter is the search key word #str_key = request.session['key_word'] str_key = request.session.get('key_word') query_string = str.join(' ', str(str_key)) # Parse the query string to produce a Xapian::Query object. qp = xapian.QueryParser() stemmer = xapian.Stem("english") qp.set_stemmer(stemmer) qp.set_database(database) qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME) query = qp.parse_query(query_string) print "Parsed query is: %s" % str(query) # Find the top 10 results for the query. enquire.set_query(query) matches = enquire.get_mset(0, 10) # Display the results. print "%i results found." % matches.get_matches_estimated() result_found = matches.get_matches_estimated() print "Results 1-%i:" % matches.size() #remove all the data in the database first, and then displace them search_result1 = search_result.objects.all() search_result1.delete() i = 0 for m in matches: i = i + 1 #generate the filename temp_str = str(m.document.get_data()).split(" sample=") temp = temp_str[0] file_name = temp.strip("url=/") print "result file name ::: " + file_name[0:4].strip("\ns") file_name1 = file_name[0:4].strip("\ns") #create object search_result.objects.create(result_id=i, total_result_number=str(result_found), match_rate=str(m.percent), doc_id=str(file_name1), content=str(m.document.get_data())) #the file name is store in url return render(request, 'hello_world.html', {'result': (search_result.objects.all())})
def make_tg():
    """Build a TermGenerator with an English stemmer and a small stop list.

    BUG FIX: the old code did `del stopper` before returning.  Xapian's
    Python bindings have historically not kept the stopper alive from the
    C++ side, so dropping the last Python reference can leave the
    generator pointing at a freed object.  Keep a reference on the object
    we return so the stopper outlives the generator.
    """
    termgen = xapian.TermGenerator()
    termgen.set_stemmer(xapian.Stem('en'))
    stopper = xapian.SimpleStopper()
    stopper.add('to')
    stopper.add('not')
    termgen.set_stopper(stopper)
    # Pin the stopper's lifetime to the generator's.
    termgen._stopper = stopper
    return termgen
def Do_Index(self, callback=None):
    """loop through all feeds and entries and feed them to the beast"""
    # Helper used to bail out mid-index: clears the indexing flag,
    # releases the lock, fires the optional callback and interrupts.
    def index_interrupt():
        self._indexing = False
        self._index_lock.release()
        if callback is not None:
            callback()
        self._interrupt()
        return
    # Non-blocking acquire: if another index run holds the lock, skip.
    if not self._index_lock.acquire(False):
        logging.info("already indexing, not trying to reindex again")
        return
    self._indexing = True
    db = self._get_db()
    c = db.cursor()
    #remove existing DB
    utils.deltree(self._storeDir)
    database = xapian.WritableDatabase(self._storeDir, xapian.DB_CREATE_OR_OPEN)
    indexer = xapian.TermGenerator()
    stemmer = xapian.Stem("english")
    indexer.set_stemmer(stemmer)
    # Snapshot feeds and entries up front so the SQLite handles can be
    # closed before the (potentially long) indexing pass.
    c.execute(u"""SELECT id, title, description FROM feeds""")
    feeds = c.fetchall()
    c.execute(u"""SELECT id, feed_id, title, description,fakedate FROM entries ORDER BY fakedate""")
    entries = c.fetchall()
    c.close()
    db.close()
    logging.info("indexing feeds")
    # Lazily yields one xapian.Document per feed; `indexer` and the
    # shared value-slot constants come from the enclosing scope.
    # NOTE(review): `entries` is fetched above but not consumed in this
    # visible span -- presumably indexed further down; confirm.
    def feed_index_generator(feeds):
        for feed_id, title, description in feeds:
            try:
                doc = xapian.Document()
                forindex = title+" "+description
                #eh? we can only remove docs by term, but we can only
                #get values. so we need both it seems
                doc.add_term("f"+str(feed_id))
                doc.add_value(FEED_ID, str(feed_id))
                doc.add_value(DATE, "")
                doc.set_data(forindex)
                indexer.set_document(doc)
                indexer.index_text(forindex)
                #database.add_document(doc)
                yield doc
            except Exception, e:
                logging.error("Failed in indexDocs, feeds: %s" % str(e))
def Search(self, command, blacklist=[], include=['feeds','entries'], since=0):
    """returns two lists, one of search results in feeds, and one for results in entries.  It is sorted so that title results are first, description results are second"""
    # NOTE(review): `blacklist` and `include` are mutable default
    # arguments (shared across calls) -- safe only if never mutated here;
    # `include` is not referenced in this visible span.
    if not self._index_lock.acquire(False):
        #if we are indexing, don't try to search
        #print "wouldn't get lock"
        return ([],[])
    self._index_lock.release()
    database = xapian.Database(self._storeDir)
    enquire = xapian.Enquire(database)
    qp = xapian.QueryParser()
    stemmer = xapian.Stem("english")
    qp.set_stemmer(stemmer)
    qp.set_database(database)
    qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    # Newest docids first, boolean (unweighted) matching.
    enquire.set_docid_order(xapian.Enquire.DESCENDING)
    enquire.set_weighting_scheme(xapian.BoolWeight())
    # Display the results.
    #print "%i results found." % matches.get_matches_estimated()
    #print "Results 1-%i:" % matches.size()
    #for m in matches:
    #    print "%i: %i%% docid=%i [%s] %s %s %s" % (m.rank + 1, m.percent, m.docid, m.document.get_data()[0:100], m.document.get_value(0), m.document.get_value(1), m.document.get_value(2))
    feed_results=[]
    entry_results=[]
    query = qp.parse_query(command)
    enquire.set_query(query)
    matches = enquire.get_mset(0, 100)
    for m in matches:
        doc = m.document
        feed_id = doc.get_value(FEED_ID)
        feed_id = int(feed_id)
        try:
            if feed_id not in blacklist:
                entry_id = doc.get_value(ENTRY_ID)
                # NOTE(review): `is ''` compares identity, not equality --
                # works only by CPython string interning; review.
                if entry_id is '':
                    # meaning this is actually a feed (we could know that from above, but eh)
                    feed_results.append(int(feed_id))
                else:
                    # meaning "entry"
                    title = doc.get_value(ENTRY_TITLE)
                    # DATE value is stored in milliseconds; convert to seconds.
                    fakedate = float(doc.get_value(DATE)) / 1000.0
                    if fakedate > since:
                        entry_results.append((int(entry_id),title, fakedate, feed_id))
                #else:
                #    print "excluding:"+doc.get("title")
        except Exception, e:
            print e
            print feed_id
            print blacklist
def search(dbpath, querystring, offset=0, pagesize=10):
    """Search *dbpath* for *querystring*, sorting by distance from DC.

    offset   -- starting point within result set
    pagesize -- number of records to retrieve

    ROBUSTNESS FIX: value slot 4 is only populated for documents indexed
    with coordinates; the KeyMaker previously raised ValueError unpacking
    the empty value.  Such documents now sort last instead of crashing.
    """
    # Open the database we're going to search.
    db = xapian.Database(dbpath)
    # Set up a QueryParser with a stemmer and suitable prefixes
    queryparser = xapian.QueryParser()
    queryparser.set_stemmer(xapian.Stem("en"))
    queryparser.set_stemming_strategy(queryparser.STEM_SOME)
    queryparser.add_prefix("title", "S")
    queryparser.add_prefix("description", "XD")
    # And parse the query
    query = queryparser.parse_query(querystring)
    # Use an Enquire object on the database to run the query
    enquire = xapian.Enquire(db)
    enquire.set_query(query)

    class DistanceKeyMaker(xapian.KeyMaker):
        def __call__(self, doc):
            # Return a sortable string representing the distance from
            # Washington, DC to the middle of this state.
            value = doc.get_value(4).decode('utf8')
            if not value:
                # No coordinates indexed: sort after every real distance.
                return xapian.sortable_serialise(float('inf'))
            x, y = map(float, value.split(','))
            washington = (38.012, -77.037)
            return xapian.sortable_serialise(
                support.distance_between_coords((x, y), washington))

    enquire.set_sort_by_key_then_relevance(DistanceKeyMaker(), False)
    # And print out something about each match
    matches = []
    for match in enquire.get_mset(offset, pagesize):
        fields = json.loads(match.document.get_data().decode('utf8'))
        print(
            u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n Population %(pop)s" % {
                'rank': match.rank + 1,
                'docid': match.docid,
                'name': fields.get('name', u''),
                'date': support.format_date(fields.get('admitted', u'')),
                'pop': support.format_numeral(int(fields.get('population', 0))),
                'lat': fields.get('latitude', u''),
                'lon': fields.get('longitude', u''),
            })
        matches.append(match.docid)
    # Finally, make sure we log the query and displayed results
    support.log_matches(querystring, offset, pagesize, matches)
def phrase(terms, prefix=None, language=None, window=3):
    """Build an OP_PHRASE Query over *terms*.

    terms    -- a string (split on whitespace) or a sequence of terms
    prefix   -- optional field prefix applied as "prefix:term"
    language -- optional stemming language; stemmed terms get a 'Z' prefix
    window   -- phrase window size

    FIX: removed a leftover debug `print terms` that wrote to stdout on
    every call.
    """
    if isinstance(terms, basestring):
        terms = terms.split()
    if prefix:
        terms = ['%s:%s' % (prefix, t) for t in terms]
    if language:
        stem = xapian.Stem(language)
        # NOTE(review): when a prefix was applied above, the whole
        # "prefix:term" string is passed through the stemmer -- confirm
        # that is the intended term form.
        terms = ['Z%s' % stem(t) for t in terms]
    return Query(Query.OP_PHRASE, terms, window)
def __init__(self, database_dir):
    """Initialize the searcher."""
    index_dir = os.path.join(database_dir, IndexProcessor.INDEX_DIR)
    self._db = xapian.Database(index_dir)
    # One stemmer per configured language.
    self._stemmers = [xapian.Stem(lang) for lang in IndexProcessor.STEM_LANGS]
def __init__(self, dbpath, *, cjk=False):
    """Initialize indexer with dbpath.

    cjk -- when true, enable CJK n-gram indexing on the term generator.
    """
    self._db = None
    self.dbpath = dbpath
    generator = xapian.TermGenerator()
    generator.set_stemmer(xapian.Stem("en"))
    self.term_generator = generator
    if cjk:
        generator.set_flags(generator.FLAG_CJK_NGRAM)
        logger.info("FLAG_CJK_NGRAM enabled")
    self.open()
def __init__(self, db_path):
    """Open (or create) the writable database at *db_path* and prepare indexing.

    FIX: removed a bare `try: ... except: raise` wrapper -- it re-raised
    every exception unchanged, so it had no effect beyond obscuring the
    code (and bare `except:` is an anti-pattern).
    """
    # Open the database for update, creating a new database if necessary.
    self.database = xapian.WritableDatabase(db_path, xapian.DB_CREATE_OR_OPEN)
    self.indexer = xapian.TermGenerator()
    self.stemmer = xapian.Stem("english")  # XXX
    self.indexer.set_stemmer(self.stemmer)
def search(dbpath, querystring, materials, offset=0, pagesize=10):
    """Run *querystring* against *dbpath*, optionally filtered by *materials*.

    offset   -- starting point within the result set
    pagesize -- number of records to retrieve
    """
    db = xapian.Database(dbpath)
    # Stemming query parser with the field prefixes this index uses.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")
    query = parser.parse_query(querystring)
    if len(materials) > 0:
        # Keep only documents tagged with at least one requested material:
        # OR together one boolean term per material, then apply the result
        # as a filter so it does not affect ranking weights.
        per_material = [
            xapian.Query('XM' + material.lower())
            for material in materials
        ]
        any_material = xapian.Query(xapian.Query.OP_OR, per_material)
        query = xapian.Query(xapian.Query.OP_FILTER, query, any_material)
    # Run the query and report each match.
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    matches = []
    for match in enquire.get_mset(offset, pagesize):
        fields = json.loads(match.document.get_data())
        print(
            u"%(rank)i: #%(docid)3.3i %(title)s" % {
                'rank': match.rank + 1,
                'docid': match.docid,
                'title': fields.get('TITLE', u''),
            })
        matches.append(match.docid)
    # Log the query and the docids we displayed.
    support.log_matches(querystring, offset, pagesize, matches)
def search(self, query_string, offset=0, numresults=10):
    """Parse *query_string* and return an MSet of up to *numresults* matches."""
    database = self.get_database()
    # English-stemming parser bound to the database.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("english"))
    parser.set_database(database)
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    enquire = xapian.Enquire(database)
    enquire.set_query(parser.parse_query(query_string))
    return enquire.get_mset(offset, numresults)
def query_fact_normally(fact, k_matches):
    """Query the module-level `db` for *fact*.

    Returns the stored data (UTF-8 decoded) of the top *k_matches* documents.
    """
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.add_prefix("text", "")
    parser.add_prefix("title", "S")
    query = parser.parse_query(my_own_queryparser(fact))
    enquire = xapian.Enquire(db)
    # enquire.set_weighting_scheme(xapian.TfIdfWeight())
    enquire.set_query(query)
    results = []
    for match in enquire.get_mset(0, k_matches):
        results.append(match.document.get_data().decode("utf8"))
    return results
def index(datapath, dbpath):
    """Index museum-catalogue CSV rows from *datapath* into *dbpath*."""
    # Create or open the database we're going to be writing to.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
    generator = xapian.TermGenerator()
    generator.set_stemmer(xapian.Stem("en"))
    for fields in parse_csv_file(datapath):
        # 'fields' maps field name to value; pull out what we index.
        description = fields.get('DESCRIPTION', u'')
        title = fields.get('TITLE', u'')
        identifier = fields.get('id_NUMBER', u'')
        doc = xapian.Document()
        generator.set_document(doc)
        # Field-prefixed terms for scoped queries.
        generator.index_text(title, 1, 'S')
        generator.index_text(description, 1, 'XD')
        # Unprefixed terms for general search; the termpos bump keeps
        # phrases from spanning the title/description boundary.
        generator.index_text(title)
        generator.increase_termpos()
        generator.index_text(description)
        # Keep the full record around for display at search time.
        doc.set_data(json.dumps(fields, encoding='utf8'))
        # Numeric document values used for sorting/ranges.
        measurements = fields.get('MEASUREMENTS', u'')
        if measurements != u'':
            numbers = numbers_from_string(measurements)
            if len(numbers) > 0:
                doc.add_value(0, xapian.sortable_serialise(max(numbers)))
        date_made = fields.get('DATE_MADE', u'')
        years = numbers_from_string(date_made)
        if len(years) > 0:
            doc.add_value(1, xapian.sortable_serialise(years[0]))
        # The identifier-based boolean term makes re-indexing idempotent.
        idterm = u"Q" + identifier
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
def reload_database(self):  # {{{
    """Recreate the database handle, query parser, term generator and enquires."""
    self.db = xapian.WritableDatabase(self.database, xapian.DB_CREATE_OR_OPEN)
    # Query side.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem(self.language))
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.add_prefix("title", "S")
    self.qp = parser
    # Indexing side.
    generator = xapian.TermGenerator()
    generator.set_stemmer(xapian.Stem(self.language))
    self.tg = generator
    # Older xapian bindings have no TermGenerator stemming strategy.
    try:
        generator.set_stemming_strategy(generator.STEM_SOME)
    except AttributeError:
        pass
    self.e = xapian.Enquire(self.db)
    self.sorted_e = xapian.Enquire(self.db)
    # Value 2 is the lowercase form of the title
    self.sorted_e.set_sort_by_value(2, False)
def __init__(self):
    # Set up a French-stemming term generator plus an extractor; any
    # failure is reported to stderr and aborts the process.
    try:
        self.extract = utilsXapian.Extract()
        self.indexer = xapian.TermGenerator()
        stemmer = xapian.Stem("french")
        self.indexer.set_stemmer(stemmer)
        # Path of the file currently being indexed (set later).
        self.fichier = ""
        # NOTE(review): WritableDatabase() is called with no path --
        # confirm the bindings in use accept a no-argument constructor
        # (a real path is presumably supplied elsewhere).
        self.database = xapian.WritableDatabase()
    except Exception, e:
        print >> sys.stderr, "Exception: %s" % str(e)
        sys.exit(1)
def __init__(self, tfidf_path, strict=True):
    """Open the tf-idf database at *tfidf_path* and prepare a query parser."""
    # NOTE(review): `strict` is accepted but unused in this body --
    # confirm whether it was meant to affect parsing.
    self.db = xapian.Database(tfidf_path)
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    # Field-prefix configuration.
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")
    self.queryparser = parser
    self.enquire = xapian.Enquire(self.db)
def xapian_init_databases():
    """ Initializes all database objects. """
    xapian_ensure_db_dir(XAPIAN_DIR_NAME)
    for field in INDEXES:
        # One on-disk database (and term generator) per indexed field.
        xapian_ensure_db_dir(XAPIAN_DIR_NAME + "/" + field)
        # NOTE(review): the directory is ensured under XAPIAN_DIR_NAME but
        # the database is opened under XAPIAN_DIR -- confirm these two
        # constants resolve to the same location.
        database = xapian.WritableDatabase(XAPIAN_DIR + "/" + field,
                                           xapian.DB_CREATE_OR_OPEN)
        indexer = xapian.TermGenerator()
        stemmer = xapian.Stem("english")
        indexer.set_stemmer(stemmer)
        # Cache the handle pair in the module-level registry.
        DATABASES[field] = (database, indexer)
def search(dbpath, title, offset=0, pagesize=10): print '{}'.format(title) regex = re.findall(r'\w+', title.lower()) print 'REGEX: {}'.format(regex) queryAND = ' AND '.join(regex) queryOR = ' OR '.join(regex) for querystring in [queryAND.encode('utf-8'), queryOR.encode('utf-8')]: print 'QUERY: {}'.format(querystring) # offset - defines starting point within result set # pagesize - defines number of records to retrieve # Open the database we're going to search. db = xapian.Database(dbpath) # Set up a QueryParser with a stemmer and suitable prefixes queryparser = xapian.QueryParser() queryparser.set_stemmer(xapian.Stem("en")) queryparser.set_stemming_strategy(queryparser.STEM_SOME) # Start of prefix configuration. queryparser.add_prefix("TITLE", "T") queryparser.add_prefix("BODY", "B") queryparser.add_prefix("ID", "I") queryparser.add_prefix("CLASS", "C") # End of prefix configuration. # And parse the query query = queryparser.parse_query(querystring) # Use an Enquire object on the database to run the query enquire = xapian.Enquire(db) enquire.set_query(query) # And print out something about each match matches = [] for match in enquire.get_mset(offset, pagesize): fields = json.loads(match.document.get_data()) # if title == fields.get('TITLE', u''): # print 'This is \n' print( u"%(rank)i: #%(docid)3.3i \n %(id)s \n %(title)s \n %(body)s \n %(class)s \n" % { 'rank': match.rank + 1, 'docid': match.docid, 'id': fields.get('ID', u''), 'title': fields.get('TITLE', u''), 'body': fields.get('BODY', u''), 'class': fields.get('CLASS', u'') }) matches.append(match.docid) # Finally, make sure we log the query and displayed results log_matches(querystring, offset, pagesize, matches)
def search(request):
    """Django view: run the 's' GET parameter against the Xapian index
    and render the matches via parliament/search.html."""
    searchdb = settings.XAPIAN_DB
    database = xapian.Database(searchdb)
    enquire = xapian.Enquire(database)
    # The search keyword comes from the 's' query parameter.
    query_string = request.GET.get('s') or ''
    response = HttpResponse(mimetype='text/html')
    # English-stemming parser with AND as the default operator.
    qp = xapian.QueryParser()
    qp.set_stemmer(xapian.Stem("english"))
    qp.set_database(database)
    qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
    qp.set_default_op(xapian.Query.OP_AND)
    # Boolean field prefixes used for filtering.
    for field, prefix in [('speaker', 'S'), ('major', 'M'), ('date', 'D'),
                          ('batch', 'B'), ('segment', 'U'),
                          ('department', 'G'), ('party', 'P'),
                          ('column', 'C'), ('gid', 'Q')]:
        qp.add_boolean_prefix(field, prefix)
    enquire.set_query(qp.parse_query(query_string))
    matches = enquire.get_mset(0, 20)

    # Lightweight record exposing rank/docid/path/percent to the template.
    class Hit:
        pass

    resultset = []
    for m in matches:
        hit = Hit()
        hit.rank = m.rank + 1
        hit.docid = m.docid
        hit.path = m.document.get_data()
        hit.percent = m.percent
        resultset.append(hit)
    t = loader.get_template('parliament/search.html')
    c = Context({
        'resultcount': matches.get_matches_estimated(),
        'results': resultset,
    })
    response.write(t.render(c))
    return response
def _init_xapian_search():
    """Populate the module-level Xapian_Enquires registry, one entry per model
    class in `models` that has an index directory on disk."""
    global Xapian_Enquires
    Xapian_Enquires = {}
    for model_name, model in inspect.getmembers(models, inspect.isclass):
        db_dir = os.path.join(XAPIAN_INDICES_DIR, model_name)
        if not os.path.exists(db_dir):
            # No index has been built for this model yet.
            continue
        db = xapian.Database(db_dir)
        enquire = xapian.Enquire(db)
        parser = xapian.QueryParser()
        parser.set_stemmer(xapian.Stem('english'))
        parser.set_database(db)
        parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
        Xapian_Enquires[model] = (db, enquire, parser)