def get_index():
    """Return the "search" index, building it first if it cannot be opened."""
    try:
        return FileStorage(settings.WHOOSH_INDEX).open_index(indexname="search")
    except IOError:
        # No index yet (or it is unreadable): build one, then open it again.
        create_index()
        return FileStorage(settings.WHOOSH_INDEX).open_index(indexname="search")
def update_index(sender, instance, created, **kwargs):
    """Post-save signal handler: (re)index ``instance`` in the Whoosh index.

    Best-effort by design: indexing problems must never make the model save
    fail, so errors are logged nowhere and simply swallowed — but only via
    ``except Exception`` now, not bare ``except:`` (which also caught
    KeyboardInterrupt/SystemExit).
    """
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index()
    try:
        writer = ix.writer()
    except Exception:
        # Index is locked or otherwise unavailable -- skip this update.
        return
    tags = []
    for t in instance.tags.all():
        try:
            tags.append(unicode(t.name))
        except Exception:
            # Skip tags that cannot be coerced to unicode.
            pass
    tags = u','.join(tags)
    # New rows get add_document; existing rows get update_document (keyed
    # on the unique "id" field).  Same keyword arguments either way.
    write = writer.add_document if created else writer.update_document
    try:
        write(title=instance.title,
              content=instance.content,
              tags=tags,
              author=instance.author.get_profile().name + u"\n" + instance.author.username,
              id=unicode(instance.pk))
        writer.commit()
    except Exception:
        # Best-effort: release the writer lock instead of leaking it
        # (the original left the writer open on failure).
        try:
            writer.cancel()
        except Exception:
            pass
def update_index(sender, instance, created, **kwargs):
    """Post-save hook keeping the 'memopol' Whoosh index in sync."""
    if int(os.environ.get('SKIP_SEARCH_INDEX', '0')):
        return
    try:
        url = unicode(instance.get_absolute_url())
    except Exception:
        log.critical('Cant resolve url. Content %r not indexed' % instance)
        return
    # "content" may be a plain attribute, a callable, or absent entirely.
    content = getattr(instance, 'content', None)
    if content is None:
        content = unicode(instance)
    elif callable(content):
        content = content()
    ix = FileStorage(settings.WHOOSH_INDEX).open_index(indexname='memopol')
    writer = ix.writer()
    doc = dict(title=unicode(instance),
               content=content,
               type=unicode(instance.__class__.__name__.lower()),
               url=url)
    if created:
        writer.add_document(**doc)
    else:
        writer.update_document(**doc)
    writer.commit()
def init_index(index=".index"):
    """Create (or recreate) a Whoosh index under *index* and return it opened."""
    if not os.path.exists(index):
        os.mkdir(index)
    storage = FileStorage(index)
    schema = Schema(
        name=TEXT(stored=True),
        ext=KEYWORD,
        title=TEXT(stored=True),
        content=TEXT,
        path=ID(stored=True),
        tags=KEYWORD,
    )
    # create_index() always runs, wiping whatever was there before.
    storage.create_index(schema)
    return storage.open_index()
def get_index(index=".index"): indexZ=index if not os.path.exists(indexZ): return "there is no index with this name %s!! use indexer to build the index" % index sys.exit() storage = FileStorage(indexZ) ix = storage.open_index() print "the index has %d docs" % ix.doc_count_all() return ix
def handle_noargs(self, **options): # from settings import HAYSTACK_CONNECTIONS # storage = FileStorage(HAYSTACK_CONNECTIONS['default']['PATH']) storage = FileStorage('/dev/shm/whoosh/') ix = storage.open_index('SPELL') with ix.reader() as r: for id in r.all_doc_ids(): print r.stored_fields(id)
def _open_indexes(self):
    """Open storage and the 'MAIN' index, creating both on first use.

    The original assigned the result of ``create_index`` to ``self.ix``
    only to overwrite it on the very next line; the redundant assignment
    is dropped, keeping the final state identical.
    """
    if not os.path.exists("index"):
        os.mkdir("index")
    storage = FileStorage("index")
    # Initialise the index on first run, then open it either way.
    if not storage.index_exists(indexname='MAIN'):
        storage.create_index(IndexerSchema, indexname='MAIN')
    self.ix = storage.open_index(indexname='MAIN')
def eval_get_ranked_set_baseline(self, basefile):
    """Build a baseline ranked result set per article from saved keyterms.

    Reads keyterms from an N3 graph, runs an OR-of-keyterms query per
    article against the large 'ecj' Whoosh index (BM25F weighting), and
    returns a dict mapping article URI -> ordered list of reconstructed
    CELEX document URIs.
    """
    # Step 1: Read the saved keyterms for a subset of articles
    # (created by analyze_baseline_queries)
    g = Graph()
    g.parse(self.generic_path("keyterms", "analyzed", ".n3"), format="n3")
    articles = {}
    for (s, p, o) in g:
        if not str(s) in articles:
            articles[str(s)] = []
        articles[str(s)].append(str(o))
    # Step 2: Open the large whoosh index containing the text of
    # all cases. Then, create a query for each article based on
    # the keyterms.
    connector = query.Or
    indexdir = os.path.sep.join([self.config.datadir, 'ecj', 'index'])
    storage = FileStorage(indexdir)
    idx = storage.open_index()
    searcher = idx.searcher(weighting=scoring.BM25F())
    res = {}
    # for article in sorted(articles.keys()):
    for article in self._articles(basefile):
        terms = articles[article]
        rankedset = []
        #parser = qparser.QueryParser("content", idx.schema)
        #q = parser.parse(connector.join(terms))
        q = query.And([
            # query.Term("articles", article),
            connector([query.Term("content", x) for x in terms])
        ])
        # print q
        # self.log.debug("Article %s: %s", article, " or ".join(terms))
        results = searcher.search(q, limit=None)
        resultidx = 0
        # self.log.info("Keyterms for result: %r" % results.key_terms("content", docs=10, numterms=10))
        for result in results:
            reslbl = "%s (%s)" % (
                result['basefile'], results.score(resultidx))
            rankedset.append([result['basefile'], reslbl])
            # self.log.debug(u"\t%s: %2.2d" % (result['title'], results.score(resultidx)))
            resultidx += 1
        # NOTE(review): rankedset[0] raises IndexError when a query matches
        # nothing -- presumably every article has at least one hit; confirm.
        self.log.info("Created baseline ranked set for %s: Top result %s (of %s)" %
                      (article.split("/")[-1], rankedset[0][0], len(rankedset)))
        # return just a list of URIs, no scoring information. But the
        # full URI isnt available in the whoosh db, so we recreate it.
        res[article] = ["http://lagen.nu/ext/celex/%s" % x[
            0] for x in rankedset]
    return res
def update_index(sender, instance, created, **kwargs):
    """Post-save hook: push ``instance`` into the 'rarog' Whoosh index."""
    ix = FileStorage(settings.WHOOSH_INDEX).open_index(indexname="rarog")
    writer = ix.writer()
    doc = dict(title=unicode(instance),
               body_html=instance.body_html,
               url=unicode(instance.get_absolute_url()))
    if created:
        writer.add_document(**doc)
    else:
        writer.update_document(**doc)
    writer.commit()
def searchIndex(): ''' searchindex() Performs the requested search through the index/schema INPUTS: idx -- desired index to search OUTPUTS: results -- results of the search ''' # Navigate to the LM index directory c = '' while True: print 'The current directory is ' + os.getcwd() ques = 'Is the LM index (directory) in the current directory? [y/n]\t' c = raw_input(ques).lower() if c == 'y' or c == 'yes': idxDir = os.getcwd() break elif c == 'n' or c == 'no': while True: idxDir = raw_input('Where is it?\t').lower() try: os.chdir(idxDir) break except WindowsError: print 'Sorry, I couldn\'t navigate to that directory' break elif c == 'q' or c == 'quit': print '\tReturning to the Main Menu' return else: print 'I\'m sorry, I don\'t understand what you mean. Try again.' # Open the index idxDir = idxDir + '/LM_Storage' storage = FileStorage(idxDir) idx = storage.open_index(indexname = 'LM') # Determine what the user wants to search for c = '' while True: ques = 'What would you like to search? song/artist [s], lyrics [L]\t' c = raw_input(ques).lower() if c == 's' or c == 'song/artist' or c == 'song': searchForSong(idx) break elif c == 'l' or c == 'lyrics': searchForLyrics(idx) break elif c == 'q' or c == 'quit': print '\tReturning to the Main Menu' return else: print 'I\'m sorry, I don\'t understand what you mean. Try again.'
def search_does_exist(query):
    """Return True when at least one wiki title matches *query*."""
    from whoosh.qparser import QueryParser
    ix = FileStorage("indexdir").open_index(indexname="wiki")
    with ix.searcher() as searcher:
        parsed = QueryParser("title", ix.schema).parse(query)
        hits = searcher.search(parsed, limit=1)
        return len(hits) > 0
def search(self, q):
    """Full-text search over the model index; returns published matches."""
    from whoosh.filedb.filestore import FileStorage
    from whoosh.qparser import MultifieldParser
    ix = FileStorage(settings.WHOOSH_INDEX).open_index()
    # Whoosh speaks AND/NOT rather than +/-.
    translated = q.replace('+', ' AND ').replace(' -', ' NOT ')
    parser = MultifieldParser(["content", "title", "tags", "author"],
                              schema=ix.schema)
    hits = ix.searcher().search(parser.parse(translated))
    matched_ids = [hit.fields()['id'] for hit in hits]
    return self.objects.filter(id__in=matched_ids).filter(published=True)
def run_search(query):
    """Parse *query* against the 'MAIN' index and execute it.

    The formatted result strings are built and discarded, exactly as in
    the original (a smoke-test of the index, not a reporting tool).
    """
    from settings import HAYSTACK_CONNECTIONS
    ix = FileStorage(HAYSTACK_CONNECTIONS['default']['PATH']).open_index('MAIN')
    with ix.searcher() as searcher:
        from whoosh.qparser import QueryParser
        parsed = QueryParser("text", schema=ix.schema).parse(query)
        for rank, hit in enumerate(searcher.search(parsed)):
            result = "%d: (%s) %s" % (rank, hit['id'], hit['title'])  # ignored
def build_clean_index():
    """Rebuild the Whoosh index from every Student/Faculty institute row."""
    ix = FileStorage(settings.WHOOSH_INDEX).open_index()
    writer = ix.writer()
    try:
        mlogger.debug("building index from scratch.....................")
        mlogger.debug("adding objects...................")
        for record in StudentInstitute.objects.all():
            adddoc(record, writer, True)
        for record in FacultyInstitute.objects.all():
            adddoc(record, writer, True)
    finally:
        # Commit whatever was added and release the index even on error.
        writer.commit()
        ix.close()
def search(request):
    """Search the 'rarog' index with the ?q= GET parameter.

    Returns a (query, hits) tuple; ``hits`` is [] when there is no query
    or the query cannot be parsed.  The bare ``except:`` around parsing is
    narrowed to ``except Exception`` so it no longer swallows
    KeyboardInterrupt/SystemExit.
    """
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.open_index(indexname="rarog")
    hits = []
    query = request.GET.get('q', None)
    if query is not None and query != u"":
        # Whoosh understands AND/NOT rather than +/-.
        query = query.replace('+', ' AND ').replace(' -', ' NOT ')
        parser = MultifieldParser(['title', 'body_html'], schema=ix.schema)
        try:
            qry = parser.parse(query)
        except Exception:
            # Unparseable user input is not an error worth surfacing.
            qry = None
        if qry is not None:
            searcher = ix.searcher()
            hits = searcher.search(qry)
    return query, hits
def add_documents_to_index(index_name, documents): storage = FileStorage("indexdir") ix = storage.open_index(indexname=index_name) writer = ix.writer() for i, document in enumerate(documents): print "{}%".format(i/len(documents) * 100) if index_name == "wiki": writer.add_document(title=u"{}".format(sanitize_text(document.title))) if index_name == "movie": writer.add_document(title=u"{}".format(sanitize_text(document.title))) writer.commit()
def search(query):
    """Return up to one formatted wiki hit matching *query*."""
    from whoosh.qparser import QueryParser
    ix = FileStorage("indexdir").open_index(indexname="wiki")
    with ix.searcher() as searcher:
        parsed = QueryParser("title", ix.schema).parse(query)
        hits = searcher.search(parsed, limit=1)
        return ["{}".format(hit) for hit in hits]
def _setup(self, storage_directory):
    """Build the schema and open or create the backing index.

    Returns a (schema, index) pair.  Falls back to an in-memory RAM index
    when no storage directory is given.
    """
    schema = fields.Schema(
        oid=fields.ID(stored=True, unique=True),
        name=fields.ID())
    schema.add('*', fields.TEXT, glob=True)
    if not storage_directory:
        return (schema, RamStorage().create_index(schema))
    if os.path.exists(storage_directory):
        # Reuse whatever index is already on disk.
        self._using_existing_index = True
        ix = FileStorage(storage_directory).open_index()
    else:
        os.mkdir(storage_directory)
        ix = FileStorage(storage_directory).create_index(schema)
    return (schema, ix)
def search(request, template_name='search.html'):
    """
    Simple search view, which accepts search queries via url, like google.
    Use something like ?q=this+is+the+serch+term

    Optional GET parameters: ``types`` (restricts the search to document
    types) and ``limit`` (caps the number of hits).
    """
    hits = []
    data = dict(
        q = request.GET.get('q', ''),
        types = request.GET.getlist('types') or [],
    )
    form = SearchForm(data)
    form.is_valid()
    query = form.cleaned_data['q']
    types = form.cleaned_data['types']
    limit = int(request.GET.get('limit', 0)) or None
    if query not in (None, u"", u"*"):
        storage = FileStorage(settings.WHOOSH_INDEX)
        ix = storage.open_index(indexname='memopol')
        # Whoosh don't understands '+' or '-' but we can replace
        # them with 'AND' and 'NOT'.
        query = query.replace('+', ' AND ').replace(' -', ' NOT ')
        # Only prepend a type: filter when it actually narrows anything.
        if types and len(types) != len(types_choices):
            query = 'type:(%s) %s' % (' OR '.join(types), query)
        parser = QueryParser("content", schema=ix.schema)
        try:
            qry = parser.parse(query)
        except:
            # don't show the user weird errors only because we don't
            # understand the query.
            # parser.parse("") would return None
            qry = None
        if qry is not None:
            searcher = ix.searcher()
            try:
                hits = searcher.search(qry)
            except Exception, e:
                log.critical('Error while searching %s' % qry)
                log.exception(e)
            else:
                if limit:
                    hits = hits[:limit]
        ix.close()
    # NOTE(review): no return/render is visible here -- the view appears to
    # continue beyond this excerpt; confirm against the full file.
def open_dir(dirname, indexname=None, mapped=True):
    """Convenience function for opening an index in a directory, creating
    the FileStorage object for you.

    :param dirname: the path string of the directory in which to create
        the index.
    :param indexname: the name of the index to create; you only need to
        specify this if you have multiple indexes within the same storage
        object.
    :param mapped: whether to use memory mapping to speed up disk reading.
    :returns: :class:`Index`
    """
    if indexname is None:
        indexname = _DEF_INDEX_NAME
    from whoosh.filedb.filestore import FileStorage
    return FileStorage(dirname, mapped=mapped).open_index(indexname)
def open_dir(dirname, indexname=None, readonly=False):
    """Convenience function for opening an index in a directory, creating
    the FileStorage object for you.

    :param dirname: the path string of the directory in which to create
        the index.
    :param indexname: the name of the index to create; you only need to
        specify this if you have multiple indexes within the same storage
        object.
    """
    if indexname is None:
        indexname = _DEF_INDEX_NAME
    from whoosh.filedb.filestore import FileStorage
    return FileStorage(dirname, readonly=readonly).open_index(indexname)
def newIndex(): ''' newIndex() Creates the index/schema for the Whoosh module INPUTS: (none) OUTPUTS: idx -- index ''' print '\tCreating a new Index in the current directory' # Create an index to store the artist/title and lyrics schm = Schema(Name=TEXT(stored=True), Ingr=KEYWORD(stored=True, commas=True)) # Create a directory called FAR_Storage; will contain the index # See Whoosh documentation for more information if not os.path.exists('FAR_Storage'): os.mkdir('FAR_Storage') idxDir ='FAR_Storage' storage = FileStorage(idxDir) idx = storage.create_index(schm, indexname='FAR') idx = storage.open_index(indexname = 'FAR') return idx
def generateSchedule(): while True: print 'The current directory is ' + os.getcwd() ques = 'Is the FAR index (directory) in the current directory? [y/n]\t' c = raw_input(ques).lower() if c == 'y' or c == 'yes': idxDir = os.getcwd() break elif c == 'n' or c == 'no': while True: idxDir = raw_input('Where is it?\t').lower() try: os.chdir(idxDir) break # Im sure this only works in Windoze except WindowsError: print 'Sorry, I couldn\'t navigate to that directory' break elif c == 'q' or c == 'quit': print '\tReturning to the Main Menu' return else: print 'I\'m sorry, I don\'t understand what you mean. Try again.' # Open the index print '\nThe Food \'Roo is gathering your recipes. Please wait paitiently ...' idxDir = idxDir + '/FAR_Storage' storage = FileStorage(idxDir) idx = storage.open_index(indexname = 'FAR') result = idx.searcher().documents() r = result print 'Yay! We\'ve got a whole pouch full of recipies' print 'You\'ve got ' + str(len(list(r))) + ' recipies in Food \'Roo\'s pouch\n\n' for item in result: print 'I can get in here' print item print item['Name']
def handle_noargs(self, **options): from settings import HAYSTACK_CONNECTIONS storage = FileStorage(HAYSTACK_CONNECTIONS['default']['PATH']) # storage = FileStorage('/dev/shm/whoosh/') ix = storage.open_index('MAIN') with ix.searcher() as s: from whoosh.qparser import QueryParser qp = QueryParser("content", schema=ix.schema) q = qp.parse(u"((title:whee OR text:whee OR notes:whee OR program:whee OR job_title:whee) AND document_type:5 AND (programs:1 OR programs:2))") results = s.search(q) for i, r in enumerate(results): print "%d: (%s) %s" % (i, r['id'], r['title']) q = qp.parse(u"((title:whee OR text:whee OR notes:whee OR program:whee OR job_title:whee) AND document_type:5 AND (programs:1 OR programs:2))") results = s.search(q, sortedby='-created') for i, r in enumerate(results): print "%d: (%s) %s" % (i, r['id'], r['title'])
def whoosh_open_dir_32_or_64(dirname, indexname=None, readonly=False):
    """Open a Whoosh index directory, disabling mmap on 32-bit machines.

    Based on open_dir in whoosh.index.  Memory-mapping large indices can
    exhaust a 32-bit address space, so mmap is enabled only when the
    interpreter is 64-bit.

    :param dirname: the path string of the directory in which to create
        the index.
    :param indexname: the name of the index to create; you only need to
        specify this if you have multiple indexes within the same storage
        object.
    """
    from whoosh.filedb.filestore import FileStorage
    from whoosh.index import _DEF_INDEX_NAME
    storage = FileStorage(dirname,
                          readonly=readonly,
                          supports_mmap=not is32bit())
    return storage.open_index(_DEF_INDEX_NAME if indexname is None else indexname)
class TinaIndex():
    """
    Open or Create a whoosh index
    Provides searching methods
    """
    def __init__( self, indexdir ):
        # Handles created lazily by later operations.
        self.writer = None
        self.reader = None
        self.searcher = None
        self.indexdir = indexdir
        self.storage = FileStorage(self.indexdir)
        self.index = None
        try:
            self.index = self.storage.open_index()
        except EmptyIndexError, e:
            # No usable index yet: build the schema and directory, then
            # create a fresh index in place.
            _logger.warning( "No existing index at %s : "%self.indexdir)
            self.schema = TinaSchema()
            if not os.path.exists(self.indexdir):
                os.mkdir(self.indexdir)
            self.index = self.storage.create_index(self.schema)
        except LockError, le:
            # Another process holds the index lock; re-raise for the caller.
            _logger.error("index LockError %s : "%self.indexdir)
            raise LockError(le)
ix_path = 'indexdir/' ix_name = 'luxun_index_name' if not os.path.exists(ix_path): os.mkdir(ix_path) ix = create_in(ix_path, schema, indexname=ix_name) ix.close() from whoosh.filedb.filestore import FileStorage import re # Replace pattern rpp = re.compile('[\u3000※·\n\r〔〕 ]') storage = FileStorage(ix_path) #idx_path 为索引路径 ix = storage.open_index(indexname=ix_name) # 按照schema定义信息,增加需要建立索引的文档 # 注意:字符串格式需要为unicode格式 with ix.writer() as w: for article in articles: with open(article, 'r', encoding='utf-8') as f: arr = f.read() subs = re.split(rpp, arr) art = ''.join(subs) length = len(art) ind = art[::-1].find('〔1〕') if ind > 0: art = art[:length - ind - 1] art = re.split('[\d]', art) art = ''.join(art[1:]) name_arr = article.split('/')
def search(request):
    """Flower search endpoint (POST only).

    searchType='keywords': full-text search the Whoosh index on the
    'content' field and return matching flower categories plus related
    info/images.
    searchType='image': run the TFLite detector on the uploaded image and
    return detections whose score is >= 0.25.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'Method not allowed'}, status=405)
    if request.POST['searchType'] == 'keywords':
        if not request.POST['keywords']:
            return JsonResponse({'error': 'Bad request'}, status=400)
        # Same response shape as the image branch, so clients can share code.
        result = {
            'detections': 0,
            'locations': [],
            'scores': [],
            'categories': [],
            'similarImages': [],
            'taxanomy': [],
            'benefits': [],
            'extra': [],
            'queryType': 'keywords',
        }
        schema = Schema(title=TEXT(stored=True), cat_id=ID(stored=True), content=TEXT)
        storage = FileStorage(f'{BASE_DIR}/TheApp/flowers/indexdir')
        ix = storage.open_index(schema=schema)
        with ix.searcher() as searcher:
            query = QueryParser('content', ix.schema).parse(request.POST['keywords'])
            results = searcher.search(query)
            for r in results:
                if 'cat_id' in r:
                    cid = int(r['cat_id'])
                    result['categories'].append(cid)
        # Deduplicate: one "detection" per distinct category.
        result['detections'] = len(set(result['categories']))
        result['taxanomy'], result['benefits'], result['extra'], result[
            'similarImages'] = get_flower_info_and_images(result['categories'])
        return JsonResponse(result, safe=True)
    elif request.POST['searchType'] == 'image':
        if not request.FILES['image']:
            return JsonResponse({'error': 'Bad request'}, status=400)
        # Run inference on the uploaded image.
        image = Image.open(request.FILES.get('image'))
        image = image.resize((300, 300)).convert('RGB')
        image = np.asarray(image, dtype=np.uint8)
        # Load TFLite model and allocate tensors.
        interpreter = tf.lite.Interpreter(
            model_path=f"{BASE_DIR}/TheApp/flowers/ML-Models/model.tflite")
        interpreter.allocate_tensors()
        # Get input and output tensors.
        input_details = interpreter.get_input_details()
        output_details = interpreter.get_output_details()
        input_shape = input_details[0]['shape']
        input_data = np.reshape(image, input_shape)
        interpreter.set_tensor(input_details[0]['index'], input_data)
        interpreter.invoke()
        # Output tensor order assumed to be locations, classes, scores,
        # count -- TODO confirm against the exported model's signature.
        locations = interpreter.get_tensor(output_details[0]['index'])
        categories = interpreter.get_tensor(output_details[1]['index'])
        scores = interpreter.get_tensor(output_details[2]['index'])
        detections = interpreter.get_tensor(output_details[3]['index'])
        result = {
            'detections': 0,
            'locations': [],
            'scores': [],
            'categories': [],
            'similarImages': [],
            'taxanomy': [],
            'benefits': [],
            'extra': [],
            'queryType': 'image',
        }
        for d in range(int(detections[0])):
            score = scores[0][d]
            # Confidence cut-off: keep detections scoring 0.25 or better.
            if score >= 0.25:
                loc = locations[0][d]
                category = categories[0][d]
                result['detections'] = result['detections'] + 1
                result['locations'].append(loc)
                result['scores'].append(score)
                result['categories'].append(category)
        result['taxanomy'], result['benefits'], result['extra'], result[
            'similarImages'] = get_flower_info_and_images(result['categories'])
        return JsonResponse(result, encoder=NumpyJSONEncoder, safe=True)
    else:
        return JsonResponse({'error': 'Bad request'}, status=400)
# Indica onde fica o indice storage = FileStorage(indexdir) # Cria schema schema = Schema( id=ID(stored=True, unique=True), titulo=TEXT(stored=True), conteudo=TEXT(stored=True), tags=KEYWORD ) # Verifica se existe o indice chamado docs na pasta indexdir usages_exists = index.exists_in(indexdir, indexname="docs2") if (usages_exists is True): # Abre o indice gerado ix = storage.open_index(indexname="docs2") else: # Cria novo indice ix = storage.create_index(schema, indexname="docs2") def carregarTxt(): # Passar por todos os arquivos txt da pasta docs writer = ix.writer() for num,arq in enumerate(glob.glob("docs/*.txt")): # Abrir arquivo try: f = codecs.open(arq, "r", "UTF-8") except Exception: print 'Falha na leitura do arquivo: ', arq linhas = f.readlines()
class WhooshSearchBackend(BaseSearchBackend):
    """Search backend storing its index on disk via Whoosh FileStorage."""

    query_compiler_class = WhooshSearchQueryCompiler
    results_class = WhooshSearchResults
    rebuilder_class = WhooshSearchRebuilder

    def __init__(self, params):
        super().__init__(params)
        self.params = params
        self.setup_complete = False
        self.use_file_storage = True
        self.post_limit = params.get("POST_LIMIT", 128 * 1024 * 1024)
        self.path = params.get("PATH")
        self.setup()
        self.refresh_index(optimize=False)

    def setup(self):
        """
        Defers loading until needed.
        """
        new_index = False
        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True
        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % self.path)
        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        self.schema = self.build_schema()
        self.content_field_name = "text"
        self.parser = QueryParser(self.content_field_name, schema=self.schema)
        self.parser.add_plugins([FuzzyTermPlugin])
        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            # An existing directory may still hold an empty/corrupt index.
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)
        self.setup_complete = True

    def build_schema(self):
        # Fixed minimal schema: stored identifiers plus one full-text field.
        schema_fields = {
            'id': WHOOSH_ID(stored=True, unique=True),
            'django_ct': WHOOSH_ID(stored=True),
            'django_id': WHOOSH_ID(stored=True),
            'text': TEXT(stored=True),
        }
        return Schema(**schema_fields)

    def get_config(self):
        return self.params.get('SEARCH_CONFIG')

    def get_index_for_model(self, model, db_alias=None):
        return Index(self, model, db_alias)

    def get_index_for_object(self, obj):
        return self.get_index_for_model(obj._meta.model, obj._state.db)

    def reset_index(self):
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage and os.path.exists(self.path):
            shutil.rmtree(self.path)
        elif not self.use_file_storage:
            self.storage.clean()
        # Recreate everything.
        self.setup()

    def add_type(self, model):
        pass  # Not needed.

    def refresh_index(self, optimize=True):
        if not self.setup_complete:
            self.setup()
        else:
            self.index = self.index.refresh()
        if optimize:
            # optimize is a locking operation, shouldn't be called unless recreating the index
            self.index.optimize()

    def add(self, obj):
        self.get_index_for_object(obj).add_item(obj)

    def add_bulk(self, model, obj_list):
        if obj_list:
            self.get_index_for_object(obj_list[0]).add_items(model, obj_list)

    def delete(self, obj):
        self.get_index_for_object(obj).delete_item(obj)
class RedisWhooshStore(SAMLStoreBase):  # TODO: This needs a gc mechanism for keys (uuids)
    """SAML metadata store: entity XML and collection metadata live in
    Redis; a Whoosh index provides full-text lookup and search."""

    def json_dict(self, name):
        # Redis-backed JSON mapping with an LRU cache in front.
        return LRUProxyDict(JSONDict(key='{}_{}'.format(self._name, name),
                                     redis=self._redis,
                                     writeback=True),
                            maxsize=config.cache_size)

    def xml_dict(self, name):
        # Redis-backed XML mapping with an LRU cache in front.
        return LRUProxyDict(XMLDict(key='{}_{}'.format(self._name, name),
                                    redis=self._redis,
                                    writeback=True),
                            maxsize=config.cache_size)

    def __init__(self, *args, **kwargs):
        self._dir = kwargs.pop('directory', '.whoosh')
        clear = bool(kwargs.pop('clear', config.store_clear))
        self._name = kwargs.pop('name', config.store_name)
        self._redis = kwargs.pop('redis', redis())
        if clear:
            shutil.rmtree(self._dir)
        now = datetime.now()
        self._last_index_time = now
        self._last_modified = now
        self._setup()
        if clear:
            self.reset()

    def _setup(self):
        # Also called from __setstate__, where _redis may be missing.
        self._redis = getattr(self, '_redis', None)
        if not self._redis:
            self._redis = redis()  # XXX test cases won't get correctly unpicked because of this
        self.schema = Schema(content=NGRAMWORDS(stored=False))
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        self.schema.add('sha1', ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self.objects = self.xml_dict('objects')
        self.parts = self.json_dict('parts')
        self.storage = FileStorage(os.path.join(self._dir, self._name))
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except BaseException as ex:
            # Missing/corrupt index: recreate it and rebuild from Redis.
            log.warning(ex)
            self.storage.create()
            self.index = self.storage.create_index(self.schema)
            self._reindex()

    def __getstate__(self):
        # Pickle only cheap scalars; live handles are rebuilt in _setup().
        state = dict()
        for p in ('_dir', '_name', '_last_index_time', '_last_modified'):
            state[p] = getattr(self, p)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._setup()

    def __call__(self, *args, **kwargs):
        watched = kwargs.pop('watched', None)
        scheduler = kwargs.pop('scheduler', None)
        if watched is not None and scheduler is not None:
            super(RedisWhooshStore, self).__call__(watched=watched, scheduler=scheduler)
            log.debug("indexing using {}".format(scheduler))
            if scheduler is not None:  # and self._last_modified > self._last_index_time and :
                scheduler.add_job(
                    RedisWhooshStore._reindex,
                    args=[self],
                    max_instances=1,
                    coalesce=True,
                    misfire_grace_time=2 * config.update_frequency,
                )

    def _reindex(self):
        """Drop refs no collection mentions, then rewrite the whole index."""
        log.debug("indexing the store...")
        self._last_index_time = datetime.now()
        seen = set()
        refs = set([b2u(s) for s in self.objects.keys()])
        parts = self.parts.values()
        for ref in refs:
            for part in parts:
                if ref in part['items']:
                    seen.add(ref)
        ix = self.storage.open_index()
        lock = ix.lock("reindex")
        try:
            log.debug("waiting for index lock")
            lock.acquire(True)
            log.debug("got index lock")
            with ix.writer() as writer:
                for ref in refs:
                    if ref not in seen:
                        log.debug("removing unseen ref {}".format(ref))
                        del self.objects[ref]
                        del self.parts[ref]
                log.debug("updating index")
                for e in self.objects.values():
                    info = self._index_prep(entity_simple_info(e))
                    ref = object_id(e)
                    writer.add_document(object_id=ref, **info)
                # CLEAR merge: replace the old segments wholesale on commit.
                writer.mergetype = CLEAR
        finally:
            try:
                log.debug("releasing index lock")
                lock.release()
            except ThreadError as ex:
                pass

    def dump(self):
        # Debug helper: print every indexed document.
        ix = self.storage.open_index()
        from whoosh.query import Every
        with ix.searcher() as searcher:
            for result in ix.searcher().search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        """Flatten entity info into the field dict the schema expects."""
        res = dict()
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        content = " ".join(
            filter(
                lambda x: x is not None,
                [
                    info.get(x, '')
                    for x in ('service_name', 'title', 'domain', 'keywords', 'scopes')
                ],
            ))
        res['content'] = content.strip()
        for a, v in info.items():
            k = a
            if a in ATTRS_INV:
                k = ATTRS_INV[a]
            if k in self.schema.names():
                if type(v) in (list, tuple):
                    res[k] = " ".join([vv.lower() for vv in v])
                elif type(v) in six.string_types:
                    res[k] = info[a].lower()
        res['sha1'] = hash_id(info['entity_id'], prefix=False)
        return res

    def update(self, t, tid=None, etag=None, lazy=True):
        """Store a single EntityDescriptor or a whole EntitiesDescriptor,
        skipping work when the etag is unchanged; reindex unless lazy."""
        relt = root(t)
        assert relt is not None
        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            ref = object_id(relt)
            parts = None
            if ref in self.parts:
                parts = self.parts[ref]
            if etag is not None and (parts is None or parts.get('etag', None) != etag):
                self.parts[ref] = {
                    'id': relt.get('entityID'),
                    'etag': etag,
                    'count': 1,
                    'items': [ref]
                }
                self.objects[ref] = relt
                self._last_modified = datetime.now()
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            if tid is None:
                tid = relt.get('Name')
            if etag is None:
                etag = hex_digest(dumptree(t, pretty_print=False), 'sha256')
            parts = None
            if tid in self.parts:
                parts = self.parts[tid]
            if parts is None or parts.get('etag', None) != etag:
                items = set()
                for e in iter_entities(t):
                    ref = object_id(e)
                    items.add(ref)
                    self.objects[ref] = e
                self.parts[tid] = {
                    'id': tid,
                    'count': len(items),
                    'etag': etag,
                    'items': list(items)
                }
                self._last_modified = datetime.now()
        if not lazy:
            self._reindex()

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def collections(self):
        return [b2u(ref) for ref in self.parts.keys()]

    def reset(self):
        # NOTE(review): the loop variable k is unused and both keys are
        # deleted on every iteration -- likely meant self._redis.delete(k).
        for k in ('{}_{}'.format(self._name, 'parts'), '{}_{}'.format(self._name, 'objects')):
            self._redis.delete('{}_{}'.format(self._name, 'parts'))
            self._redis.delete('{}_{}'.format(self._name, 'objects'))

    def size(self, a=None, v=None):
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def _prep_key(self, key):
        """Rewrite a user query (+/-, attribute URIs) into Whoosh syntax."""
        # import pdb; pdb.set_trace()
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        key = key.replace('-', ' AND NOT ')
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()
        return key

    def _entities(self):
        # Every entity reachable from any collection, deduplicated.
        lst = set()
        for ref_data in self.parts.values():
            for ref in ref_data['items']:
                e = self.objects.get(ref, None)
                if e is not None:
                    lst.add(e)
        return b2u(list(lst))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def lookup(self, key):
        """Resolve key as 'entities', an object ref, a collection id, or
        finally as a Whoosh query over object_id."""
        if key == 'entities' or key is None:
            return self._entities()
        bkey = six.b(key)
        if bkey in self.objects:
            return [self.objects.get(bkey)]
        if bkey in self.parts:
            res = []
            part = self.parts.get(bkey)
            for item in part['items']:
                res.extend(self.lookup(item))
            return res
        key = self._prep_key(key)
        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                e = self.objects.get(result['object_id'], None)
                if e is not None:
                    lst.add(e)
        return b2u(list(lst))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def search(self, query=None, path=None, entity_filter=None, related=None):
        """Free-text search over content/domain; returns discojson dicts."""
        if entity_filter:
            query = "{!s} AND {!s}".format(query, entity_filter)
        query = self._prep_key(query)
        qp = MultifieldParser(['content', 'domain'], schema=self.schema)
        q = qp.parse(query)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            log.debug(results)
            for result in results:
                lst.add(result['object_id'])
        res = list()
        for ref in lst:
            e = self.objects.get(ref, None)
            if e is not None:
                res.append(discojson(e))
        return res
class SearchBackend(BaseSearchBackend):
    """Legacy (Haystack 1.x era, Python 2 — note ``unicode``/``long``/
    ``basestring`` usage below) Whoosh search backend.

    Stores the index either on disk (``HAYSTACK_WHOOSH_PATH``) or in a
    process-local RAM store, depending on ``HAYSTACK_WHOOSH_STORAGE``.
    """

    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, site=None):
        """Read Whoosh-related settings; raise ImproperlyConfigured when
        file storage is selected but no path is configured."""
        super(SearchBackend, self).__init__(site)
        self.setup_complete = False
        self.use_file_storage = True
        self.post_limit = getattr(settings, 'HAYSTACK_WHOOSH_POST_LIMIT', 128 * 1024 * 1024)

        if getattr(settings, 'HAYSTACK_WHOOSH_STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not hasattr(settings, 'HAYSTACK_WHOOSH_PATH'):
            raise ImproperlyConfigured('You must specify a HAYSTACK_WHOOSH_PATH in your settings.')

    def setup(self):
        """
        Defers loading until needed.
        """
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
            os.makedirs(settings.HAYSTACK_WHOOSH_PATH)
            new_index = True

        if self.use_file_storage and not os.access(settings.HAYSTACK_WHOOSH_PATH, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % settings.HAYSTACK_WHOOSH_PATH)

        if self.use_file_storage:
            self.storage = FileStorage(settings.HAYSTACK_WHOOSH_PATH)
        else:
            # RAM storage is shared process-wide via the LOCALS global.
            global LOCALS

            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """Map Haystack field objects onto a Whoosh ``Schema``; return
        ``(content_field_name, schema)``."""
        schema_fields = {
            'id': ID(stored=True, unique=True),
            'django_ct': ID(stored=True),
            'django_id': ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float)
            elif field_class.field_type == 'boolean':
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer())

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """(Re)index every object in *iterable* using *index*'s prepared data."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            writer.update_document(**doc)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)

    def remove(self, obj_or_string, commit=True):
        """Delete the single document whose ``id`` matches the identifier of
        *obj_or_string*."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)
        self.index.delete_by_query(q=self.parser.parse(u'id:"%s"' % whoosh_id))

    def clear(self, models=[], commit=True):
        """Remove all documents, or only those belonging to *models*.

        NOTE(review): the mutable default ``models=[]`` is shared across
        calls; harmless here since it is only read, but ``models=None``
        would be safer.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        if not models:
            self.delete_index()
        else:
            models_to_delete = []

            for model in models:
                models_to_delete.append(u"django_ct:%s.%s" % (model._meta.app_label, model._meta.module_name))

            self.index.delete_by_query(q=self.parser.parse(u" OR ".join(models_to_delete)))

    def delete_index(self):
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage and os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
            shutil.rmtree(settings.HAYSTACK_WHOOSH_PATH)
        elif not self.use_file_storage:
            self.storage.clean()

        # Recreate everything.
        self.setup()

    def optimize(self):
        """Trigger a Whoosh index optimization pass."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        self.index.optimize()

    @log_query
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None,
               query_facets=None, narrow_queries=None, spelling_query=None,
               limit_to_registered_models=None, **kwargs):
        """Run *query_string* against the index and return the usual Haystack
        result dict (``results``/``hits``/optionally ``spelling_suggestion``).
        Faceting is not supported and only warns."""
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        query_string = force_unicode(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if len(sort_by) > 1 and reverse_counter > 1:
                raise SearchBackendError("Whoosh does not handle more than one field and any field being ordered in reverse.")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            # Whoosh (this version) sorts by a single field only.
            sort_by = sort_by_list[0]

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if limit_to_registered_models:
            # Using narrow queries, limit the results to only models registered
            # with the current site.
            if narrow_queries is None:
                narrow_queries = set()

            registered_models = self.build_registered_models_list()

            if len(registered_models) > 0:
                narrow_queries.add('django_ct:(%s)' % ' OR '.join(registered_models))

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_unicode(nq)))

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            # Prevent against Whoosh throwing an error. Requires an end_offset
            # greater than 0.
            if not end_offset is None and end_offset <= 0:
                end_offset = 1

            raw_results = searcher.search(parsed_query, limit=end_offset, sortedby=sort_by, reverse=reverse)

            # Handle the case where the results have been narrowed.
            if narrowed_results:
                raw_results.filter(narrowed_results)

            # Determine the page.
            page_num = 0

            if end_offset is None:
                end_offset = 1000000

            if start_offset is None:
                start_offset = 0

            page_length = end_offset - start_offset

            if page_length and page_length > 0:
                # NOTE(review): plain '/' — integer division on Python 2;
                # would yield a float page number on Python 3.
                page_num = start_offset / page_length

            # Increment because Whoosh uses 1-based page numbers.
            page_num += 1

            try:
                raw_page = ResultsPage(raw_results, page_num, page_length)
            except ValueError:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            return self._process_results(raw_page, highlight=highlight, query_string=query_string, spelling_query=spelling_query)
        else:
            if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None,
                       limit_to_registered_models=None, **kwargs):
        """Not supported by this backend; warn and return an empty result set."""
        warnings.warn("Whoosh does not handle More Like This.", Warning, stacklevel=2)
        return {
            'results': [],
            'hits': 0,
        }

    def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None):
        """Convert a Whoosh ``ResultsPage`` into Haystack ``SearchResult``s."""
        from haystack import site
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)
        facets = {}
        spelling_suggestion = None
        indexed_models = site.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result['django_ct'].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = site.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if isinstance(index.fields[string_key], MultiValueField):
                            # NOTE(review): 'is 0' relies on CPython small-int
                            # caching; should be '== 0'.
                            if value is None or len(value) is 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(',')
                        else:
                            additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del(additional_fields['django_ct'])
                del(additional_fields['django_id'])

                if highlight:
                    from whoosh import analysis
                    from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                    sa = analysis.StemmingAnalyzer()
                    terms = [term.replace('*', '') for term in query_string.split()]

                    additional_fields['highlighted'] = {
                        self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                    }

                result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
                results.append(result)
            else:
                # Hit references a model that is no longer indexed; drop it.
                hits -= 1

        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        """Suggest a corrected query by spell-checking each word of
        *query_string* after stripping reserved words/characters."""
        spelling_suggestion = None
        sp = SpellChecker(self.storage)
        cleaned_query = force_unicode(query_string)

        if not query_string:
            return spelling_suggestion

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        for word in query_words:
            suggestions = sp.suggest(word, number=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])

        spelling_suggestion = ' '.join(suggested_words)
        return spelling_suggestion

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            # Promote bare dates to midnight datetimes.
            if not hasattr(value, 'hour'):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            # NOTE(review): this branch is a no-op (True stays True, False
            # stays False) — the newer backend converts to 'true'/'false'
            # strings here instead.
            if value:
                value = True
            else:
                value = False
        elif isinstance(value, (list, tuple)):
            value = u','.join([force_unicode(v) for v in value])
        elif isinstance(value, (int, long, float)):
            # Leave it alone.
            pass
        else:
            value = force_unicode(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, basestring):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'], date_values['minute'], date_values['second'])

            try:
                # Attempt to use json to load the values.
                converted_value = json.loads(value)

                # Try to handle most built-in types.
                if isinstance(converted_value, (list, tuple, set, dict, int, float, long, complex)):
                    return converted_value
            except:
                # If it fails (SyntaxError or its ilk) or we don't trust it,
                # continue on.
                pass

        return value
class SearchIndex:
    """Whoosh-backed full-text index over a repository of files.

    Each document has two stored fields: ``fileid`` (the repository
    identifier) and ``content`` (the extracted text).
    """

    def __init__(self, index_folder):
        # Create the index on first use; later runs simply reopen it.
        self.storage = FileStorage(index_folder)
        if not self.storage.index_exists():
            schema = Schema(fileid=ID(stored=True), content=TEXT(stored=True))
            self.storage.create_index(schema)
        self.index = self.storage.open_index()

    def size(self):
        """Return the number of documents in the index."""
        return self.index.doc_count()

    def list_files(self):
        """Yield the fileid of every indexed document."""
        with self.index.reader() as reader:
            for docnum, doc in reader.iter_docs():
                yield doc['fileid']

    def index_content(self, fileid, path_to_file, mimetype):
        """Index one file.
        """
        with self.index.writer() as writer:
            self._index_content(fileid, path_to_file, mimetype, writer)

    def delete(self, fileid):
        """Delete one file.
        """
        with self.index.searcher() as searcher:
            docnum = searcher.document_number(fileid=fileid)
        # NOTE(review): document_number() yields None for an unknown fileid,
        # which delete_document() will not accept — confirm callers only
        # delete fileids that are actually indexed.
        with self.index.writer() as writer:
            writer.delete_document(docnum)

    def _index_content(self, fileid, path_to_file, mimetype, writer):
        """Extract text from one file and add the document to *writer*.

        Files without a registered extractor are still indexed, with a
        placeholder message as their content.
        """
        if mimetype not in EXTRACTORS:  # idiom fix: was 'not mimetype in'
            content = "Missing extractor for {}".format(mimetype)
        else:
            with open(path_to_file, 'rb') as f:
                document_bytes = f.read()
            magic = detect_from_content(document_bytes)
            content = EXTRACTORS[mimetype](path_to_file, document_bytes, magic)
        writer.add_document(fileid=fileid, content=content)

    def refresh(self, all_files, file_repo):
        """Extract the text from all the files in the repository, purging
        existing repo."""
        # Snapshot the doc ids with a reader first; the writer then deletes
        # them all and re-adds every file in one batch commit.
        with self.index.reader() as reader:
            docids = list(reader.all_doc_ids())
        with self.index.writer() as writer:
            for docid in docids:
                writer.delete_document(docid)
            for node in all_files:
                self._index_content(
                    node['id'],
                    file_repo.get_absolute_path_to_file(node['number']),
                    node['mimetype'],
                    writer)

    def similarto(self, fileid, top=20, numterms=50):
        """Return the fileids of up to *top* documents similar to *fileid*,
        using *numterms* key terms of its content."""
        with self.index.searcher() as searcher:
            docnum = searcher.document_number(fileid=fileid)
            results = searcher.more_like(docnum, 'content', top=top,
                                         numterms=numterms, normalize=False)
            return [hit['fileid'] for hit in results]

    def search(self, query_str, limit=20):
        """Return the fileids of at most *limit* documents whose content
        matches *query_str*."""
        qp = QueryParser('content', schema=self.index.schema)
        query = qp.parse(query_str)
        with self.index.searcher() as searcher:
            results = searcher.search(query, limit=limit)
            return [hit['fileid'] for hit in results]

    def text(self, fileid):
        """Return the stored extracted text of *fileid*, or a not-found
        message when it is absent from the index."""
        with self.index.searcher() as searcher:
            doc = searcher.document(fileid=fileid)
            if doc:
                return doc['content']
            else:
                return "File {} not in index!".format(fileid)
class WhooshSearchBackend(BaseSearchBackend):
    """Modern django-haystack Whoosh backend.

    Index storage is either on disk (``PATH`` connection option) or in a
    process-wide RAM store (``STORAGE`` != 'file').
    """

    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        "\\",
        "+",
        "-",
        "&&",
        "||",
        "!",
        "(",
        ")",
        "{",
        "}",
        "[",
        "]",
        "^",
        '"',
        "~",
        "*",
        "?",
        ":",
        ".",
    )

    def __init__(self, connection_alias, **connection_options):
        """Read connection options; raise ImproperlyConfigured when file
        storage is selected but no PATH is given."""
        super().__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # NOTE(review): connection_options is a dict, so getattr() never
        # finds "POST_LIMIT" and always returns the default — this likely
        # should be connection_options.get("POST_LIMIT", ...).
        self.post_limit = getattr(connection_options, "POST_LIMIT", 128 * 1024 * 1024)
        self.path = connection_options.get("PATH")

        if connection_options.get("STORAGE", "file") != "file":
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'."
                % connection_alias)

        self.log = logging.getLogger("haystack")

    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections

        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            # RAM storage is shared process-wide via the LOCALS global.
            global LOCALS

            if getattr(LOCALS, "RAM_STORE", None) is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[
            self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)
        self.parser.add_plugins([FuzzyTermPlugin])

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """Map Haystack field objects onto a Whoosh ``Schema``; return
        ``(content_field_name, schema)``."""
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ""

        for _, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost,
                    )
            elif field_class.field_type in ["date", "datetime"]:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == "integer":
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=int,
                    field_boost=field_class.boost,
                )
            elif field_class.field_type == "float":
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    numtype=float,
                    field_boost=field_class.boost,
                )
            elif field_class.field_type == "boolean":
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == "ngram":
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost,
                )
            elif field_class.field_type == "edge_ngram":
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at="start",
                    stored=field_class.stored,
                    field_boost=field_class.boost,
                )
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=field_class.analyzer or StemmingAnalyzer(),
                    field_boost=field_class.boost,
                    sortable=True,
                )

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """(Re)index every object in *iterable*; logs and continues on
        per-object failures unless ``silently_fail`` is off."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug("Indexing for object `%s` skipped", obj)
            else:
                # Really make sure it's unicode, because Whoosh won't have it any
                # other way.
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                # Document boosts aren't supported in Whoosh 2.5.0+.
                if "boost" in doc:
                    del doc["boost"]

                try:
                    writer.update_document(**doc)
                except Exception as e:
                    if not self.silently_fail:
                        raise

                    # We'll log the object identifier but won't include the actual object
                    # to avoid the possibility of that generating encoding errors while
                    # processing the log message:
                    self.log.error(
                        "%s while preparing object for update" % e.__class__.__name__,
                        exc_info=True,
                        extra={
                            "data": {
                                "index": index,
                                "object": get_identifier(obj)
                            }
                        },
                    )

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()
            # Wait for the async writer thread, if one was spawned.
            if writer.ident is not None:
                writer.join()

    def remove(self, obj_or_string, commit=True):
        """Delete the document whose ID matches *obj_or_string*'s identifier."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            self.index.delete_by_query(q=self.parser.parse('%s:"%s"' % (ID, whoosh_id)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to remove document '%s' from Whoosh: %s",
                whoosh_id,
                e,
                exc_info=True,
            )

    def clear(self, models=None, commit=True):
        """Remove all documents, or only those belonging to *models*."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        if models is not None:
            # NOTE(review): assert-based validation is stripped under -O;
            # an explicit TypeError would be more robust.
            assert isinstance(models, (list, tuple))

        try:
            if models is None:
                self.delete_index()
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append("%s:%s" % (DJANGO_CT, get_model_ct(model)))

                self.index.delete_by_query(
                    q=self.parser.parse(" OR ".join(models_to_delete)))
        except Exception as e:
            if not self.silently_fail:
                raise

            if models is not None:
                self.log.error(
                    "Failed to clear Whoosh index of models '%s': %s",
                    ",".join(models_to_delete),
                    e,
                    exc_info=True,
                )
            else:
                self.log.error("Failed to clear Whoosh index: %s", e, exc_info=True)

    def delete_index(self):
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage and os.path.exists(self.path):
            shutil.rmtree(self.path)
        elif not self.use_file_storage:
            self.storage.clean()

        # Recreate everything.
        self.setup()

    def optimize(self):
        """Trigger a Whoosh index optimization pass."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        self.index.optimize()

    def calculate_page(self, start_offset=0, end_offset=None):
        """Translate Haystack's offset window into Whoosh's 1-based
        ``(page_num, page_length)``."""
        # Prevent against Whoosh throwing an error. Requires an end_offset
        # greater than 0.
        if end_offset is not None and end_offset <= 0:
            end_offset = 1

        # Determine the page.
        page_num = 0

        if end_offset is None:
            end_offset = 1000000

        if start_offset is None:
            start_offset = 0

        page_length = end_offset - start_offset

        if page_length and page_length > 0:
            page_num = int(start_offset / page_length)

        # Increment because Whoosh uses 1-based page numbers.
        page_num += 1

        return page_num, page_length

    @log_query
    def search(self,
               query_string,
               sort_by=None,
               start_offset=0,
               end_offset=None,
               fields="",
               highlight=False,
               facets=None,
               date_facets=None,
               query_facets=None,
               narrow_queries=None,
               spelling_query=None,
               within=None,
               dwithin=None,
               distance_point=None,
               models=None,
               limit_to_registered_models=None,
               result_class=None,
               **kwargs):
        """Run *query_string* and return the Haystack result dict. Supports
        sorting, field/date faceting, and model narrowing; query faceting
        only warns."""
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {"results": [], "hits": 0}

        query_string = force_str(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != "*":
            return {"results": [], "hits": 0}

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith("-"):
                    reverse_counter += 1

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            for order_by in sort_by:
                if order_by.startswith("-"):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            sort_by = sort_by_list

        group_by = []
        facet_types = {}

        if facets is not None:
            group_by += [
                FieldFacet(facet, allow_overlap=True, maptype=Count)
                for facet in facets
            ]
            facet_types.update({facet: "fields" for facet in facets})

        if date_facets is not None:

            def _fixup_datetime(dt):
                # Whoosh's DateRangeFacet needs datetimes; promote dates.
                if isinstance(dt, datetime):
                    return dt
                if isinstance(dt, date):
                    return datetime(dt.year, dt.month, dt.day)
                raise ValueError

            for key, value in date_facets.items():
                start = _fixup_datetime(value["start_date"])
                end = _fixup_datetime(value["end_date"])
                gap_by = value["gap_by"]
                gap_amount = value.get("gap_amount", 1)
                gap = RelativeDelta(**{"%ss" % gap_by: gap_amount})
                group_by.append(
                    DateRangeFacet(key, start, end, gap, maptype=Count))
                facet_types[key] = "dates"

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.",
                          Warning,
                          stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(" OR ".join(
                ["%s:%s" % (DJANGO_CT, rm) for rm in model_choices]))

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_str(nq)), limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {"results": [], "hits": 0}

                if narrowed_results is not None:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {"results": [], "hits": 0}

            page_num, page_length = self.calculate_page(
                start_offset, end_offset)

            search_kwargs = {
                "pagelen": page_length,
                "sortedby": sort_by,
                "reverse": reverse,
                "groupedby": group_by,
            }

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs["filter"] = narrowed_results

            try:
                raw_page = searcher.search_page(parsed_query, page_num,
                                                **search_kwargs)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {"results": [], "hits": 0, "spelling_suggestion": None}

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {"results": [], "hits": 0, "spelling_suggestion": None}

            results = self._process_results(
                raw_page,
                highlight=highlight,
                query_string=query_string,
                spelling_query=spelling_query,
                result_class=result_class,
                facet_types=facet_types,
            )
            searcher.close()

            if hasattr(narrow_searcher, "close"):
                narrow_searcher.close()

            return results
        else:
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(
                        spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(
                        query_string)
            else:
                spelling_suggestion = None

            return {
                "results": [],
                "hits": 0,
                "spelling_suggestion": spelling_suggestion,
            }

    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        """Find documents similar to *model_instance* via Whoosh's
        more_like_this on the content field."""
        if not self.setup_complete:
            self.setup()

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(" OR ".join(
                ["%s:%s" % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != "*":
            narrow_queries.add(additional_query_string)

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_str(nq)), limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {"results": [], "hits": 0}

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        self.index = self.index.refresh()
        raw_results = EmptyResults()

        searcher = None
        if self.index.doc_count():
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                raw_results = results[0].more_like_this(field_name,
                                                        top=end_offset)

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None and hasattr(raw_results, "filter"):
                raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {"results": [], "hits": 0, "spelling_suggestion": None}

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        if raw_page.pagenum < page_num:
            return {"results": [], "hits": 0, "spelling_suggestion": None}

        results = self._process_results(raw_page, result_class=result_class)

        if searcher:
            searcher.close()

        if hasattr(narrow_searcher, "close"):
            narrow_searcher.close()

        return results

    def _process_results(
        self,
        raw_page,
        highlight=False,
        query_string="",
        spelling_query=None,
        result_class=None,
        facet_types=None,
    ):
        """Convert a Whoosh ``ResultsPage`` into Haystack result objects,
        including facet counts and optional highlighting."""
        from haystack import connections

        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        facets = {}
        if facet_types:
            facets = {
                "fields": {},
                "dates": {},
                "queries": {},
            }

            for facet_fieldname in raw_page.results.facet_names():
                group = raw_page.results.groups(facet_fieldname)
                facet_type = facet_types[facet_fieldname]

                # Extract None item for later processing, if present.
                none_item = group.pop(None, None)

                # Sort by count (desc), then value (asc).
                lst = facets[facet_type][facet_fieldname] = sorted(
                    group.items(), key=(lambda itm: (-itm[1], itm[0])))

                if none_item is not None:
                    # Inject None item back into the results.
                    none_entry = (None, none_item)
                    if not lst or lst[-1][1] >= none_item:
                        lst.append(none_entry)
                    else:
                        for i, value in enumerate(lst):
                            if value[1] < none_item:
                                lst.insert(i, none_entry)
                                break

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split(".")
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], "convert"):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(
                                    ",")
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter("em")
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name),
                        terms,
                        sa,
                        ContextFragmenter(),
                        formatter,
                    )
                    additional_fields["highlighted"] = {
                        self.content_field_name: [whoosh_result]
                    }

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID], score,
                                      **additional_fields)
                results.append(result)
            else:
                # Hit references a model that is no longer indexed; drop it.
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            "results": results,
            "hits": hits,
            "facets": facets,
            "spelling_suggestion": spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        """Suggest a corrected query by running each word of *query_string*
        through the content field's corrector, after stripping reserved
        words/characters."""
        spelling_suggestion = None
        reader = self.index.reader()
        corrector = reader.corrector(self.content_field_name)
        cleaned_query = force_str(query_string)

        if not query_string:
            return spelling_suggestion

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, "")

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, "")

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        for word in query_words:
            suggestions = corrector.suggest(word, limit=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])

        spelling_suggestion = " ".join(suggested_words)
        return spelling_suggestion

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, "strftime"):
            # Promote bare dates to midnight datetimes.
            if not hasattr(value, "hour"):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = "true"
            else:
                value = "false"
        elif isinstance(value, (list, tuple)):
            value = ",".join([force_str(v) for v in value])
        elif isinstance(value, (int, float)):
            # Leave it alone.
            pass
        else:
            value = force_str(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == "true":
            return True
        elif value == "false":
            return False

        if value and isinstance(value, str):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(
                    date_values["year"],
                    date_values["month"],
                    date_values["day"],
                    date_values["hour"],
                    date_values["minute"],
                    date_values["second"],
                )

            try:
                # Attempt to use json to load the values.
                converted_value = json.loads(value)

                # Try to handle most built-in types.
                if isinstance(
                        converted_value,
                        (list, tuple, set, dict, int, float, complex),
                ):
                    return converted_value
            except Exception:
                # If it fails (SyntaxError or its ilk) or we don't trust it,
                # continue on.
                pass

        return value
def index_dict(dictionary):
    """Index a single document (given as a dict) into the on-disk "index"."""
    ix = FileStorage("index").open_index()
    add_to_index(ix, [dictionary])
class WhooshSearchBackend(BaseSearchBackend):
    """Haystack search backend backed by a Whoosh index (file or RAM storage)."""

    # Words reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, connection_alias, **connection_options):
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # Fix: connection_options is a dict, so getattr() could never find
        # 'POST_LIMIT' on it and the default was always used. Use .get()
        # like the other options below.
        self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)

    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            # Share a single RAM store process-wide so all connections see
            # the same in-memory index.
            global LOCALS

            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                # Directory exists but contains no index segments yet.
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """Map Haystack field definitions to a Whoosh ``Schema``.

        Returns ``(content_field_name, Schema)``; raises SearchBackendError
        when no user-defined fields exist.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int, field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float, field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """Add/refresh documents from *iterable* using *index* to prepare them."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            # Fix: 'except Exception, e' is Python-2-only syntax; 'as e' is
            # valid on Python 2.6+ and Python 3.
            except Exception as e:
                if not self.silently_fail:
                    raise

                # NOTE(review): self.log is not assigned in this class;
                # presumably BaseSearchBackend provides it -- confirm.
                self.log.error("Failed to add documents to Whoosh: %s", e)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if self.include_spelling is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
app.logger.setLevel(logging.INFO)
app.logger.info('microblog startup')

# Full-text search setup: open (or create on first run) the Whoosh index.
enable_search = WHOOSH_ENABLED
if enable_search:
    search_is_new = False
    if not os.path.exists(WHOOSH_BASE):
        os.mkdir(WHOOSH_BASE)
        search_is_new = True
    search_storage = FileStorage(WHOOSH_BASE)
    search_ix = None
    if search_is_new:
        # Brand new index: define the schema and create it.
        search_ix = search_storage.create_index(
            Schema(id=ID(stored=True), body=TEXT()))
    else:
        search_ix = search_storage.open_index()


class CustomJSONEncoder(JSONEncoder):
    """This class adds support for lazy translation texts to Flask's
    JSON encoder. This is necessary when flashing translated texts."""

    def default(self, obj):
        from speaklater import is_lazy_string
        if not is_lazy_string(obj):
            return super(CustomJSONEncoder, self).default(obj)
        # Force the lazy string; unicode() exists only on Python 2.
        try:
            return unicode(obj)  # python 2
        except NameError:
            return str(obj)  # python 3
class WhooshSearchBackend(BaseSearchBackend):
    """Haystack search backend backed by a Whoosh index (file or RAM storage)."""

    # Words reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, connection_alias, **connection_options):
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # Fix: connection_options is a dict, so getattr() could never find
        # 'POST_LIMIT' on it and the default was always used. Use .get()
        # like the other options below.
        self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)

    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            # Share a single RAM store process-wide so all connections see
            # the same in-memory index.
            global LOCALS

            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                # Directory exists but contains no index segments yet.
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """Map Haystack field definitions to a Whoosh ``Schema``.

        Returns ``(content_field_name, Schema)``; raises SearchBackendError
        when no user-defined fields exist.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int, field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float, field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """Add/refresh documents from *iterable* using *index* to prepare them."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            # Fix: 'except Exception, e' is Python-2-only syntax; 'as e' is
            # valid on Python 2.6+ and Python 3.
            except Exception as e:
                if not self.silently_fail:
                    raise

                # NOTE(review): self.log is not assigned in this class;
                # presumably BaseSearchBackend provides it -- confirm.
                self.log.error("Failed to add documents to Whoosh: %s", e)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if self.include_spelling is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh.qparser import MultifieldParser
import whoosh.qparser as qparser
import chinese
import os, glob, codecs, sys

# Schema for Chinese documents: every field runs through the Chinese
# analyzer so queries tokenize the same way as the indexed text.
analyzer = chinese.ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                sub_title=TEXT(stored=True, analyzer=analyzer),
                author=TEXT(stored=True, analyzer=analyzer),
                content=TEXT(stored=True, analyzer=analyzer))

storage = FileStorage("indexdir")
ix = storage.open_index()
writer = ix.writer()

# CLI arguments: <query string> <mode> [<distance>]
_string = sys.argv[1]
_mode = sys.argv[2]
normal = (_mode == "normal")
_distance = 0
if not normal:
    # Non-"normal" mode takes an extra edit/proximity distance argument.
    _distance = int(sys.argv[3])

with ix.searcher() as searcher:
    # og = qparser.OrGroup.factory(0.9)
    parser = MultifieldParser(["title", "sub_title", "author", "content"],
                              schema=ix.schema)
    # parser = qparser.QueryParser("content", ix.schema)
    parser.remove_plugin_class(qparser.PhrasePlugin)
ix = index.create_in("indexdir", schema) 打开一个已经存在某个目录的索引,使用index.open_dir() [python] view plain copy import whoosh.index as index ix = index.open_dir("indexdir") 这些是便利方法: [python] view plain copy from whoosh.filedb.filestore import FileStorage storage = FileStorage("indexdir") # Create an index ix = storage.create_index(schema) # Open an existing index storage.open_index() 你和index对象一起创建的schema对象是可序列化的并且和index一起存储 你可以在同一个目录下面使用多个索引,用关键字参数分开 [python] view plain copy # Using the convenience functions ix = index.create_in("indexdir", schema=schema, indexname="usages") ix = index.open_dir("indexdir", indexname="usages") # Using the Storage object ix = storage.create_index(schema, indexname="usages") ix = storage.open_index(indexname="usages") Clearing the index 在一个目录上调用index.create_in()函数可以清除已经存在的索引的内容 可以用函数index.exists_in()来检测指定目录上面是否有一个有效的索引
def src(indexdir, quer, dsc=True, flds="2", lim=100, w="bm", opt=[], lo="o", resdir="", run=""): """ Questa funzione permette di effettuare il reperimento di risultati per query contenute nel file indicato in quer dall'indice indicato in indexdir, utilizzando oggetti e funzioni del modulo whoosh. Parameters ---------- indexdir : string Una stringa che indica il percorso della cartella contenente l'indice su cui effettuare la ricerca. quer : string Una stringa che indica il percorso del file contenente le query da utilizzare. dsc : bool Indica quale campo della query utilizzare, titolo o descrizione, di default usa la descrizione. flds : string Indica il numero di campi, puo' assumere solo valori "1", "2" o "3". lim : int Numero massimo di documenti reperiti per query deve essere un numero positivo. w : string Schema di pesatura utilizzato, puo' assumere solo valori "bm" o "tf". "bm" sta per BM25 mentre "tf" sta per TF_IDF (per ulteriori informazioni vedi whoosh.scorig) lo : string Operatore logico utilizzato per le parole delle query puo' assumere valori "o" per OR o "a" per AND. opt : list Dovrebbe contenere due valori di tipo numerico da assegnare ai parametri del BM25. resdir : string Una stringa che indica il percorso della cartella dove salvare il file dei risultati. Se striga vuota come da default i risultati vengono stampati con print. run : string Una stringa opzionale, permette di aggiungere parte del tag e del nome dei file dei risultati Returns ------- None Notes ----- Questa funzione e' stata fatta per effettuare ricerche in un indice della gia' citata collezione ohsumed, non e' garantito che funzioni per altri. In particolare il file delle query deve essere organizzato allo stesso modo del file contenente le query sperimentali per la collezione ohsumed (il quale si dovrebbe trovare nella stessa cartella di questo programma) e i documenti dell'indice dovrebbero avere i campi 'identifier', 'title', 'abstract' e 'terms'. 
""" fst = FileStorage(indexdir) ix = fst.open_index() # Creo il runtag utilizzando tutti i parametri che si possono cambiare tag = run + "_BATCH_DESC" + str(dsc)[0] + "_" + flds + "C_GRP" + lo.upper( ) + "_" + w.upper() + "_" + str(lim) + "RES" # ------------------------------------------------------------------------------------------------ # # Interpreta la scelta di quale operatore logico si usa per raggruppare le parole delle query if lo == "o": lgroup = qparser.OrGroup elif lo == "a": lgroup = qparser.AndGroup # ------------------------------------------------------------------------------------------------ # # Interpreta la scelta dello schema di peastura if w == "tf": score = scoring.TF_IDF() elif w == "bm": if opt: # opt dovrebbe contenere il punto che ottimizza un valore(come MAP) per i due parametri score = scoring.BM25F(opt[0], opt[1]) else: score = scoring.BM25F() # ------------------------------------------------------------------------------------------------ # # Interpeta il numero di campi dei documenti da utilizzare if flds == "1": campi = "title" parser = qp elif flds == "2": campi = ["title", "abstract"] parser = mp elif flds == "3": campi = ["title", "abstract", "terms"] parser = mp # ----------------------------------------------------------------------------------------------- # #--- apertura del file delle query ---# infile = open(quer, 'r') #--- lettura del file text = infile.read() #--- dom delle query dom = parseString(text) #--- estrazione dei dati della query title = gettagdata(dom, 'title') # Utilizzare il campo title delle query if dsc == True: title = gettagdata(dom, 'desc') # Utilizzare il campo desc delle query # Togliere i commenti dalle righe successive e commentare la riga prcedente per usare entrambi #desc = gettagdata(dom,'desc') #for x in range(len(title)-1): # title[x]=title[x]+" "+desc[x] num = gettagdata(dom, 'num') infile.close() # 
------------------------------------------------------------------------------------------------- # # Apre il file dove inserire i risultati se esiste if resdir and os.path.exists(resdir): resfile = open( resdir + "/" + run + "_" + flds + "C" + ".treceval", 'w' ) # Se si cambiano piu' parametri e' consigliato usare la variabile tag al posto di run+"_"+flds+"C" per non rischiare di sovrascrivere risultati print "File dei risultati " + run + "_" + flds + "C" + ".treceval" else: print resdir, "does not exist" resdir = None # Effettua la ricerca per ogni query for qid in num[:]: title[int(qid) - 1].encode('utf-8') query = parser(campi, ix.schema, group=lgroup).parse(title[int(qid) - 1]) new_query = parser(campi, ix.schema, group=lgroup).parse( expq_cor(ix, query) ) # Corregge la query se le parole hanno una lettera sbagliata #print new_query results = ix.searcher(weighting=score).search( new_query, limit=lim) # Effettua la ricerca effettiva if results: if not resdir: # Stampa i risultati in console res(results, qid, lim, tag) else: # Stampa i risultati su file print "sta stampando i risultati della query " + qid + " su file" res(results, qid, lim, tag, resfile) else: print "non ha trovato risultati" resfile.close() ix.searcher().close() return None
class WhooshSearchBackend(BaseSearchBackend): # Word reserved by Whoosh for special use. RESERVED_WORDS = ( 'AND', 'NOT', 'OR', 'TO', ) # Characters reserved by Whoosh for special use. # The '\\' must come first, so as not to overwrite the other slash replacements. RESERVED_CHARACTERS = ( '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '.', ) def __init__(self, connection_alias, **connection_options): super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options) self.setup_complete = False self.use_file_storage = True self.post_limit = getattr(connection_options, 'POST_LIMIT', 128 * 1024 * 1024) self.path = connection_options.get('PATH') if connection_options.get('STORAGE', 'file') != 'file': self.use_file_storage = False if self.use_file_storage and not self.path: raise ImproperlyConfigured( "You must specify a 'PATH' in your settings for connection '%s'." % connection_alias) self.log = logging.getLogger('haystack') def setup(self): """ Defers loading until needed. """ from haystack import connections new_index = False # Make sure the index is there. if self.use_file_storage and not os.path.exists(self.path): os.makedirs(self.path) new_index = True if self.use_file_storage and not os.access(self.path, os.W_OK): raise IOError( "The path to your Whoosh index '%s' is not writable for the current user/group." 
% self.path) if self.use_file_storage: self.storage = FileStorage(self.path) else: global LOCALS if LOCALS.RAM_STORE is None: LOCALS.RAM_STORE = RamStorage() self.storage = LOCALS.RAM_STORE self.content_field_name, self.schema = self.build_schema(connections[ self.connection_alias].get_unified_index().all_searchfields()) self.parser = QueryParser(self.content_field_name, schema=self.schema) if new_index is True: self.index = self.storage.create_index(self.schema) else: try: self.index = self.storage.open_index(schema=self.schema) except index.EmptyIndexError: self.index = self.storage.create_index(self.schema) self.setup_complete = True def build_schema(self, fields): schema_fields = { ID: WHOOSH_ID(stored=True, unique=True), DJANGO_CT: WHOOSH_ID(stored=True), DJANGO_ID: WHOOSH_ID(stored=True), } # Grab the number of keys that are hard-coded into Haystack. # We'll use this to (possibly) fail slightly more gracefully later. initial_key_count = len(schema_fields) content_field_name = '' for field_name, field_class in fields.items(): if field_class.is_multivalued: if field_class.indexed is False: schema_fields[field_class.index_fieldname] = IDLIST( stored=True, field_boost=field_class.boost) else: schema_fields[field_class.index_fieldname] = KEYWORD( stored=True, commas=True, scorable=True, field_boost=field_class.boost) elif field_class.field_type in ['date', 'datetime']: schema_fields[field_class.index_fieldname] = DATETIME( stored=field_class.stored, sortable=True) elif field_class.field_type == 'integer': schema_fields[field_class.index_fieldname] = NUMERIC( stored=field_class.stored, numtype=int, field_boost=field_class.boost) elif field_class.field_type == 'float': schema_fields[field_class.index_fieldname] = NUMERIC( stored=field_class.stored, numtype=float, field_boost=field_class.boost) elif field_class.field_type == 'boolean': # Field boost isn't supported on BOOLEAN as of 1.8.2. 
schema_fields[field_class.index_fieldname] = BOOLEAN( stored=field_class.stored) elif field_class.field_type == 'ngram': schema_fields[field_class.index_fieldname] = NGRAM( minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost) elif field_class.field_type == 'edge_ngram': schema_fields[field_class.index_fieldname] = NGRAMWORDS( minsize=2, maxsize=15, at='start', stored=field_class.stored, field_boost=field_class.boost) else: schema_fields[field_class.index_fieldname] = TEXT( stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True) if field_class.document is True: content_field_name = field_class.index_fieldname schema_fields[field_class.index_fieldname].spelling = True # Fail more gracefully than relying on the backend to die if no fields # are found. if len(schema_fields) <= initial_key_count: raise SearchBackendError( "No fields were found in any search_indexes. Please correct this before attempting to search." ) return (content_field_name, Schema(**schema_fields)) def update(self, index, iterable, commit=True): if not self.setup_complete: self.setup() self.index = self.index.refresh() writer = AsyncWriter(self.index) for obj in iterable: doc = index.full_prepare(obj) # Really make sure it's unicode, because Whoosh won't have it any # other way. for key in doc: doc[key] = self._from_python(doc[key]) # Document boosts aren't supported in Whoosh 2.5.0+. 
if 'boost' in doc: del doc['boost'] try: writer.update_document(**doc) except Exception as e: if not self.silently_fail: raise # We'll log the object identifier but won't include the actual object # to avoid the possibility of that generating encoding errors while # processing the log message: self.log.error(u"%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={ "data": { "index": index, "object": get_identifier(obj) } }) # reset the writer so there is no 'start_doc' error from the # previous failed update attempt writer = AsyncWriter(self.index) if len(iterable) > 0: # For now, commit no matter what, as we run into locking issues otherwise. writer.commit() def remove(self, obj_or_string, commit=True): if not self.setup_complete: self.setup() self.index = self.index.refresh() whoosh_id = get_identifier(obj_or_string) try: self.index.delete_by_query(q=self.parser.parse(u'%s:"%s"' % (ID, whoosh_id))) except Exception as e: if not self.silently_fail: raise self.log.error("Failed to remove document '%s' from Whoosh: %s", whoosh_id, e) def clear(self, models=[], commit=True): if not self.setup_complete: self.setup() self.index = self.index.refresh() try: if not models: self.delete_index() else: models_to_delete = [] for model in models: models_to_delete.append(u"%s:%s.%s" % (DJANGO_CT, model._meta.app_label, model._meta.module_name)) self.index.delete_by_query( q=self.parser.parse(u" OR ".join(models_to_delete))) except Exception as e: if not self.silently_fail: raise self.log.error("Failed to clear documents from Whoosh: %s", e) def delete_index(self): # Per the Whoosh mailing list, if wiping out everything from the index, # it's much more efficient to simply delete the index files. if self.use_file_storage and os.path.exists(self.path): shutil.rmtree(self.path) elif not self.use_file_storage: self.storage.clean() # Recreate everything. 
self.setup() def optimize(self): if not self.setup_complete: self.setup() self.index = self.index.refresh() self.index.optimize() def calculate_page(self, start_offset=0, end_offset=None): # Prevent against Whoosh throwing an error. Requires an end_offset # greater than 0. if not end_offset is None and end_offset <= 0: end_offset = 1 # Determine the page. page_num = 0 if end_offset is None: end_offset = 1000000 if start_offset is None: start_offset = 0 page_length = end_offset - start_offset if page_length and page_length > 0: page_num = int(start_offset / page_length) # Increment because Whoosh uses 1-based page numbers. page_num += 1 return page_num, page_length @log_query def search(self, query_string, sort_by=None, start_offset=0, end_offset=None, fields='', highlight=False, facets=None, date_facets=None, query_facets=None, narrow_queries=None, spelling_query=None, within=None, dwithin=None, distance_point=None, models=None, limit_to_registered_models=None, result_class=None, **kwargs): if not self.setup_complete: self.setup() # A zero length query should return no results. if len(query_string) == 0: return { 'results': [], 'hits': 0, } query_string = force_text(query_string) # A one-character query (non-wildcard) gets nabbed by a stopwords # filter and should yield zero results. if len(query_string) <= 1 and query_string != u'*': return { 'results': [], 'hits': 0, } reverse = False if sort_by is not None: # Determine if we need to reverse the results and if Whoosh can # handle what it's being asked to sort by. Reversing is an # all-or-nothing action, unfortunately. 
            # NOTE(review): this chunk begins mid-method — the signature of the
            # enclosing ``search()`` lies before this chunk.  What follows
            # normalises ``sort_by``, warns about unsupported facet options,
            # narrows results to the permitted models and runs the paged query.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            # Whoosh sorts the whole result set in one direction only, so a
            # mixture of ascending and descending order_by fields is rejected.
            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    # The first field's direction decides ``reverse`` for the
                    # whole query (all fields share it per the check above).
                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            # NOTE(review): only the first order_by field is handed to Whoosh;
            # any additional fields are collected but unused.
            sort_by = sort_by_list[0]

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.",
                          Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.",
                          Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.",
                          Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        # Build the list of "app_label.model_name" choices used to narrow the
        # query to specific Django models.
        if models and len(models):
            model_choices = sorted([
                '%s.%s' % (model._meta.app_label, model._meta.module_name)
                for model in models
            ])
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_text(nq)), limit=None)

                # Any narrowing query with zero hits makes the intersection
                # empty, so bail out early.
                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            page_num, page_length = self.calculate_page(
                start_offset, end_offset)

            search_kwargs = {
                'pagelen': page_length,
                'sortedby': sort_by,
                'reverse': reverse,
            }

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(parsed_query, page_num,
                                                **search_kwargs)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            results = self._process_results(raw_page, highlight=highlight,
                                            query_string=query_string,
                                            spelling_query=spelling_query,
                                            result_class=result_class)
            searcher.close()

            if hasattr(narrow_searcher, 'close'):
                narrow_searcher.close()

            return results
        else:
            # Empty index: no results, but still honour spelling suggestions.
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(
                        spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(
                        query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None,
                       **kwargs):
        """Return documents similar to ``model_instance`` using Whoosh's
        More Like This support on the content field.

        The instance is located in the index by its identifier; its top
        similar documents are then (optionally) narrowed to the allowed
        models and paged like a normal search.
        """
        if not self.setup_complete:
            self.setup()

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted([
                '%s.%s' % (model._meta.app_label, model._meta.module_name)
                for model in models
            ])
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != '*':
            narrow_queries.add(additional_query_string)

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_text(nq)), limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        self.index = self.index.refresh()
        raw_results = EmptyResults()

        if self.index.doc_count():
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                raw_results = results[0].more_like_this(field_name,
                                                        top=end_offset)

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None and hasattr(raw_results, 'filter'):
                raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        results = self._process_results(raw_page, result_class=result_class)
        # NOTE(review): ``searcher`` is only bound inside the
        # ``doc_count()`` branch above — on an empty index this line raises
        # NameError.  Flagged, not changed.
        searcher.close()

        if hasattr(narrow_searcher, 'close'):
            narrow_searcher.close()

        return results

    def _process_results(self, raw_page, highlight=False, query_string='',
                         spelling_query=None, result_class=None):
        """Convert a Whoosh results page into Haystack's standard
        ``{'results': [...], 'hits': n, 'facets': {}, 'spelling_suggestion': ...}``
        dict, converting stored field values back to Python objects.
        """
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            # NOTE(review): ``len(value) is 0`` compares by
                            # identity, not equality — works on CPython small
                            # ints but should be ``== 0``.  Flagged, not changed.
                            if value is None or len(value) is 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(
                                    ',')
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del (additional_fields[DJANGO_CT])
                del (additional_fields[DJANGO_ID])

                if highlight:
                    from whoosh import analysis
                    from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                    sa = analysis.StemmingAnalyzer()
                    terms = [
                        term.replace('*', '') for term in query_string.split()
                    ]

                    additional_fields['highlighted'] = {
                        self.content_field_name: [
                            highlight(
                                additional_fields.get(self.content_field_name),
                                terms, sa, ContextFragmenter(terms),
                                UppercaseFormatter())
                        ],
                    }

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID], score,
                                      **additional_fields)
                results.append(result)
            else:
                # Document for an unregistered/stale model: drop it from the
                # hit count rather than returning an unusable result.
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(
                    spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(
                    query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        """Suggest a corrected spelling for ``query_string`` using the
        index's corrector on the content field; reserved Whoosh words and
        characters are stripped before suggesting.
        """
        spelling_suggestion = None
        reader = self.index.reader()
        corrector = reader.corrector(self.content_field_name)
        cleaned_query = force_text(query_string)

        if not query_string:
            return spelling_suggestion

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        for word in query_words:
            suggestions = corrector.suggest(word, limit=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])

        spelling_suggestion = ' '.join(suggested_words)
        return spelling_suggestion

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            # Dates without a time component get midnight appended so all
            # temporal values are full datetimes.
            if not hasattr(value, 'hour'):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = 'true'
            else:
                value = 'false'
        elif isinstance(value, (list, tuple)):
            value = u','.join([force_text(v) for v in value])
        elif isinstance(value, (six.integer_types, float)):
            # Leave it alone.
            pass
        else:
            value = force_text(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, six.string_types):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(date_values['year'], date_values['month'],
                                date_values['day'], date_values['hour'],
                                date_values['minute'], date_values['second'])

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)

            # Try to handle most built-in types.
            if isinstance(
                    converted_value,
                    (list, tuple, set, dict, six.integer_types, float,
                     complex)):
                return converted_value
        # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit;
        # ``except Exception`` would be safer.  Flagged, not changed.
        except:
            # If it fails (SyntaxError or its ilk) or we don't trust it,
            # continue on.
            pass

        return value
# # full text search # try: if not os.path.exists(FULLTEXT_INDEX_PATH): os.makedirs(FULLTEXT_INDEX_PATH, exist_ok=True) idx_storage = FileStorage(FULLTEXT_INDEX_PATH) if not index.exists_in(FULLTEXT_INDEX_PATH): print('creating new whoosh index') schema = getFTschema() idx = index.create_in(FULLTEXT_INDEX_PATH, schema) else: idx = idx_storage.open_index() idx_writer = idx.writer() for post in posts: keywords = " ".join(post.get_categories()) + ' ' + " ".join( post.get_keywords()) + ' ' + " ".join(post.get_tags()) # print(keywords) idx_writer.update_document( path=post.get_url(), content=post.get_raw_md().lower(), title=post.get_title().lower(), keywords=keywords, ) # TODO: pàgines a indexar? per exemple CKA
class SearchBackend(BaseSearchBackend):
    """Haystack (1.x-era) search backend backed by an on-disk Whoosh index.

    The index lives at ``settings.HAYSTACK_WHOOSH_PATH``; its schema is
    built from the fields registered on the Haystack site.  All stored
    values round-trip through :meth:`_from_python` / :meth:`_to_python`.
    """

    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        "\\", "+", "-", "&&", "||", "!", "(", ")",
        "{", "}", "[", "]", "^", '"', "~", "*", "?", ":", ".",
    )

    def __init__(self, site=None):
        super(SearchBackend, self).__init__(site)
        self.setup_complete = False

        if not hasattr(settings, "HAYSTACK_WHOOSH_PATH"):
            raise ImproperlyConfigured("You must specify a HAYSTACK_WHOOSH_PATH in your settings.")

    def setup(self):
        """
        Defers loading until needed.

        Creates the index directory if necessary, builds the schema from the
        registered search fields and opens (or creates) the Whoosh index.

        Raises:
            IOError: if the index path is not writable.
        """
        new_index = False

        # Make sure the index is there.
        if not os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
            os.makedirs(settings.HAYSTACK_WHOOSH_PATH)
            new_index = True

        if not os.access(settings.HAYSTACK_WHOOSH_PATH, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % settings.HAYSTACK_WHOOSH_PATH
            )

        self.storage = FileStorage(settings.HAYSTACK_WHOOSH_PATH)
        self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                # Directory exists but holds no index yet.
                self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """Map Haystack field objects to a Whoosh :class:`Schema`.

        Returns:
            tuple: ``(content_field_name, Schema)`` where
            ``content_field_name`` is the name of the primary document field.

        Raises:
            SearchBackendError: if no searchable fields are registered.
        """
        schema_fields = {
            "id": ID(stored=True, unique=True),
            "django_ct": ID(stored=True),
            "django_id": ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ""

        for field_name, field_class in fields.items():
            if isinstance(field_class, MultiValueField):
                schema_fields[field_name] = KEYWORD(stored=True, commas=True)
            elif isinstance(field_class, (DateField, DateTimeField, IntegerField, FloatField, BooleanField)):
                if field_class.indexed is False:
                    schema_fields[field_name] = STORED
                else:
                    # Typed values are serialised to strings, so a plain ID
                    # field is enough to make them searchable.
                    schema_fields[field_name] = ID(stored=True)
            else:
                schema_fields[field_name] = TEXT(stored=True, analyzer=StemmingAnalyzer())

            if field_class.document is True:
                content_field_name = field_name

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """Index (or re-index) every object in ``iterable`` via ``index``.

        FIX: the writer is now always released.  Previously it was only
        committed when ``iterable`` was non-empty, leaving the index write
        lock held when called with nothing to do.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = self.index.writer()

        for obj in iterable:
            doc = index.prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            writer.update_document(**doc)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if getattr(settings, "HAYSTACK_INCLUDE_SPELLING", False) is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
        else:
            # Nothing was written: discard the writer so the index lock is
            # released instead of being held indefinitely.
            writer.cancel()

    def remove(self, obj_or_string, commit=True):
        """Delete a single document, identified by object or identifier string."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)
        self.index.delete_by_query(q=self.parser.parse(u'id:"%s"' % whoosh_id))

        # For now, commit no matter what, as we run into locking issues otherwise.
        self.index.commit()

    def clear(self, models=None, commit=True):
        """Remove everything (``models`` empty/None) or just the documents
        belonging to the given models.

        FIX: ``models`` previously defaulted to a shared mutable list
        (``models=[]``); ``None`` is now the sentinel, which is
        backward-compatible for all callers.
        """
        if models is None:
            models = []

        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        if not models:
            self.delete_index()
        else:
            models_to_delete = []

            for model in models:
                models_to_delete.append(u"django_ct:%s.%s" % (model._meta.app_label, model._meta.module_name))

            self.index.delete_by_query(q=self.parser.parse(u" OR ".join(models_to_delete)))

        # For now, commit no matter what, as we run into locking issues otherwise.
        self.index.commit()

    def delete_index(self):
        """Wipe the whole index by deleting its files, then recreate it."""
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
            shutil.rmtree(settings.HAYSTACK_WHOOSH_PATH)

        # Recreate everything.
        self.setup()

    def optimize(self):
        """Ask Whoosh to merge/optimize the index segments."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        self.index.optimize()

    @log_query
    def search(
        self,
        query_string,
        sort_by=None,
        start_offset=0,
        end_offset=None,
        fields="",
        highlight=False,
        facets=None,
        date_facets=None,
        query_facets=None,
        narrow_queries=None,
        spelling_query=None,
        limit_to_registered_models=True,
        **kwargs
    ):
        """Run ``query_string`` against the index and return Haystack's
        standard results dict.  Faceting is not supported by Whoosh and only
        triggers warnings; narrowing restricts hits to registered models.
        """
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {"results": [], "hits": 0}

        query_string = force_unicode(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u"*":
            return {"results": [], "hits": 0}

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith("-"):
                    reverse_counter += 1

            if len(sort_by) > 1 and reverse_counter > 1:
                raise SearchBackendError(
                    "Whoosh does not handle more than one field and any field being ordered in reverse."
                )

            for order_by in sort_by:
                if order_by.startswith("-"):
                    sort_by_list.append(order_by[1:])

                    # The first field decides the (single) sort direction.
                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            sort_by = sort_by_list[0]

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models:
            # Using narrow queries, limit the results to only models registered
            # with the current site.
            if narrow_queries is None:
                narrow_queries = set()

            registered_models = self.build_registered_models_list()

            if len(registered_models) > 0:
                narrow_queries.add("django_ct:(%s)" % " OR ".join(registered_models))

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_unicode(nq)))

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {"results": [], "hits": 0}

            raw_results = searcher.search(parsed_query, sortedby=sort_by, reverse=reverse)

            # Handle the case where the results have been narrowed.
            if narrowed_results:
                raw_results.filter(narrowed_results)

            return self._process_results(
                raw_results,
                start_offset,
                end_offset,
                highlight=highlight,
                query_string=query_string,
                spelling_query=spelling_query,
            )
        else:
            # Empty index: nothing to search, but still offer spelling help.
            if getattr(settings, "HAYSTACK_INCLUDE_SPELLING", False):
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {"results": [], "hits": 0, "spelling_suggestion": spelling_suggestion}

    def more_like_this(self, model_instance, additional_query_string=None):
        """Not supported by this backend; warns and returns no results."""
        warnings.warn("Whoosh does not handle More Like This.", Warning, stacklevel=2)
        return {"results": [], "hits": 0}

    def _process_results(
        self, raw_results, start_offset, end_offset, highlight=False, query_string="", spelling_query=None
    ):
        """Convert raw Whoosh hits into the standard Haystack results dict,
        slicing to ``[start_offset:end_offset]`` and converting stored
        values back to Python objects.
        """
        from haystack import site
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_results)
        raw_results = raw_results[start_offset:end_offset]
        facets = {}
        spelling_suggestion = None
        indexed_models = site.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_results):
            raw_result = dict(raw_result)
            app_label, model_name = raw_result["django_ct"].split(".")
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = site.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(index.fields[string_key], "convert"):
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del additional_fields["django_ct"]
                del additional_fields["django_id"]

                if highlight:
                    from whoosh import analysis
                    from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter

                    sa = analysis.StemmingAnalyzer()
                    terms = [term.replace("*", "") for term in query_string.split()]

                    # DRL_FIXME: Highlighting doesn't seem to work properly in testing.
                    additional_fields["highlighted"] = {
                        self.content_field_name: [
                            highlight(
                                additional_fields.get(self.content_field_name),
                                terms,
                                sa,
                                ContextFragmenter(terms),
                                UppercaseFormatter(),
                            )
                        ]
                    }

                # Requires Whoosh 0.1.20+.
                if hasattr(raw_results, "score"):
                    score = raw_results.score(doc_offset)
                else:
                    score = None

                if score is None:
                    score = 0

                result = SearchResult(app_label, model_name, raw_result["django_id"], score, **additional_fields)
                results.append(result)
            else:
                # Stale document for an unregistered model: drop it from the
                # reported hit count.
                hits -= 1

        if getattr(settings, "HAYSTACK_INCLUDE_SPELLING", False):
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {"results": results, "hits": hits, "facets": facets, "spelling_suggestion": spelling_suggestion}

    def create_spelling_suggestion(self, query_string):
        """Suggest a corrected spelling for ``query_string`` using the
        stored spelling dictionary; reserved words/characters are stripped
        before suggesting.
        """
        spelling_suggestion = None
        sp = SpellChecker(self.storage)
        cleaned_query = force_unicode(query_string)

        if not query_string:
            return spelling_suggestion

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, "")

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, "")

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        for word in query_words:
            suggestions = sp.suggest(word, number=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])

        spelling_suggestion = " ".join(suggested_words)
        return spelling_suggestion

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, "strftime"):
            if hasattr(value, "hour"):
                value = force_unicode(value.strftime("%Y-%m-%dT%H:%M:%S"))
            else:
                # Date-only values get a midnight time component.
                value = force_unicode(value.strftime("%Y-%m-%dT00:00:00"))
        elif isinstance(value, bool):
            if value:
                value = u"true"
            else:
                value = u"false"
        else:
            value = force_unicode(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == "true":
            return True
        elif value == "false":
            return False

        if value:
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(
                    date_values["year"],
                    date_values["month"],
                    date_values["day"],
                    date_values["hour"],
                    date_values["minute"],
                    date_values["second"],
                )

        try:
            # SECURITY: ``eval`` executes an arbitrary expression read back
            # from the index — anything able to write index data can run code
            # here.  A safe literal parser (``ast.literal_eval`` or
            # ``json.loads``, as the newer backend in this file uses) should
            # replace it; left in place to preserve behaviour, flagged for review.
            converted_value = eval(value)

            # Try to handle most built-in types.
            if isinstance(converted_value, (list, tuple, set, dict, int, float, long, complex)):
                return converted_value
        except Exception:
            # FIX: narrowed from a bare ``except:`` so KeyboardInterrupt /
            # SystemExit are no longer swallowed.  If parsing fails
            # (SyntaxError or its ilk) or we don't trust it, continue on.
            pass

        return value
def get_index():
    """Open and return the Whoosh index stored in the local "index" directory."""
    return FileStorage("index").open_index()
def whoosh_open_idx(idx_path, schema, indexname="content"):
    """Open the Whoosh index named ``indexname`` stored under ``idx_path``,
    validating it against ``schema``, and return the index object."""
    file_storage = FileStorage(idx_path)
    return file_storage.open_index(schema=schema, indexname=indexname)
def openIndex():
    """Open and return the Whoosh index stored in the "indexdir" directory."""
    return FileStorage("indexdir").open_index()
class WhooshSearchBackend(BaseSearchBackend):
    """Haystack 2.x search backend storing data in a Whoosh index, either
    file-backed (``PATH``) or RAM-backed (``STORAGE != 'file'``).

    NOTE(review): this class continues past the end of this chunk —
    ``_process_results()`` is truncated at the bottom.
    """

    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')',
        '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, connection_alias, **connection_options):
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # NOTE(review): ``connection_options`` is a dict, so ``getattr`` here
        # always returns the default — a configured 'POST_LIMIT' option is
        # silently ignored (should be ``connection_options.get(...)``).
        self.post_limit = getattr(connection_options, 'POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)

        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Defers loading until needed.

        Chooses file or RAM storage, builds the schema from the unified
        index and opens (or creates) the Whoosh index.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group."
                          % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            # One shared RAM store per process.
            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """Map Haystack field objects to a Whoosh :class:`Schema`, returning
        ``(content_field_name, Schema)``.

        Raises ``SearchBackendError`` when no searchable fields exist.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=int, field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=float, field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, at='start', stored=field_class.stored, field_boost=field_class.boost)
            else:
                # Default text field; tokenised with the Chinese analyzer.
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=ChineseAnalyzer(), field_boost=field_class.boost, sortable=True)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """Index (or re-index) every object in ``iterable`` via ``index``,
        using an ``AsyncWriter`` to avoid blocking on the index lock.

        NOTE(review): when ``iterable`` is empty the writer is never
        committed, so its lock/resources are not released here.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if 'boost' in doc:
                del doc['boost']

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                               exc_info=True,
                               extra={"data": {"index": index,
                                               "object": get_identifier(obj)}})

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

    def remove(self, obj_or_string, commit=True):
        """Delete one document, identified by object or identifier string."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            self.index.delete_by_query(q=self.parser.parse(u'%s:"%s"' % (ID, whoosh_id)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Whoosh: %s", whoosh_id, e)

    def clear(self, models=[], commit=True):
        """Remove everything (``models`` empty) or just the given models'
        documents from the index.

        NOTE(review): mutable default argument ``models=[]`` — harmless here
        (never mutated) but a ``None`` sentinel would be safer.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        try:
            if not models:
                self.delete_index()
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append(u"%s:%s" % (DJANGO_CT, get_model_ct(model)))

                self.index.delete_by_query(q=self.parser.parse(u" OR ".join(models_to_delete)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to clear documents from Whoosh: %s", e)

    def delete_index(self):
        """Wipe the whole index (delete the files or clean the RAM store),
        then recreate it."""
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage and os.path.exists(self.path):
            shutil.rmtree(self.path)
        elif not self.use_file_storage:
            self.storage.clean()

        # Recreate everything.
        self.setup()

    def optimize(self):
        """Ask Whoosh to merge/optimize the index segments."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        self.index.optimize()

    def calculate_page(self, start_offset=0, end_offset=None):
        """Translate a ``[start_offset:end_offset]`` slice into Whoosh's
        1-based ``(page_num, page_length)`` pair."""
        # Prevent against Whoosh throwing an error. Requires an end_offset
        # greater than 0.
        if not end_offset is None and end_offset <= 0:
            end_offset = 1

        # Determine the page.
        page_num = 0

        if end_offset is None:
            end_offset = 1000000

        if start_offset is None:
            start_offset = 0

        page_length = end_offset - start_offset

        if page_length and page_length > 0:
            page_num = int(start_offset / page_length)

        # Increment because Whoosh uses 1-based page numbers.
        page_num += 1

        return page_num, page_length

    @log_query
    def search(self, query_string, sort_by=None, start_offset=0,
               end_offset=None, fields='', highlight=False, facets=None,
               date_facets=None, query_facets=None, narrow_queries=None,
               spelling_query=None, within=None, dwithin=None,
               distance_point=None, models=None,
               limit_to_registered_models=None, result_class=None, **kwargs):
        """Run ``query_string`` against the index and return Haystack's
        standard results dict.  Faceting options only warn (unsupported);
        narrowing restricts hits to the allowed models.
        """
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        query_string = force_text(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            # NOTE(review): only the first order_by field is handed to Whoosh.
            sort_by = sort_by_list[0]

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.",
                          Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.",
                          Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.",
                          Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)),
                                                                 limit=None)

                # A narrowing query with zero hits empties the intersection.
                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            page_num, page_length = self.calculate_page(start_offset, end_offset)

            search_kwargs = {
                'pagelen': page_length,
                'sortedby': sort_by,
                'reverse': reverse,
            }

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(
                    parsed_query,
                    page_num,
                    **search_kwargs
                )
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            results = self._process_results(raw_page, highlight=highlight,
                                            query_string=query_string,
                                            spelling_query=spelling_query,
                                            result_class=result_class)
            searcher.close()

            if hasattr(narrow_searcher, 'close'):
                narrow_searcher.close()

            return results
        else:
            # Empty index: nothing to search, but still offer spelling help.
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None,
                       **kwargs):
        """Return documents similar to ``model_instance`` using Whoosh's
        More Like This on the content field, narrowed and paged like a
        normal search.
        """
        if not self.setup_complete:
            self.setup()

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != '*':
            narrow_queries.add(additional_query_string)

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)),
                                                                 limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        self.index = self.index.refresh()
        raw_results = EmptyResults()

        if self.index.doc_count():
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                raw_results = results[0].more_like_this(field_name, top=end_offset)

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None and hasattr(raw_results, 'filter'):
                raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        results = self._process_results(raw_page, result_class=result_class)
        # NOTE(review): ``searcher`` is only bound inside the ``doc_count()``
        # branch above — on an empty index this line raises NameError.
        searcher.close()

        if hasattr(narrow_searcher, 'close'):
            narrow_searcher.close()

        return results

    def _process_results(self, raw_page, highlight=False, query_string='',
                         spelling_query=None, result_class=None):
        """Convert a Whoosh results page into Haystack's results dict.

        NOTE(review): method truncated — the remainder lies beyond this chunk.
        """
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
hits = len(raw_page) if result_class is None: result_class = SearchResult facets = {} spelling_suggestion = None unified_index = connections[self.connection_alias].get_unified_index() indexed_models = unified_index.get_indexed_models() for doc_offset, raw_result in enumerate(raw_page): score = raw_page.score(doc_offset) or 0 app_label, model_name = raw_result[DJANGO_CT].split('.') additional_fields = {} model = get_model(app_label, model_name) if model and model in indexed_models: for key, value in raw_result.items(): index = unified_index.get_index(model) string_key = str(key) if string_key in index.fields and hasattr(index.fields[string_key], 'convert'): # Special-cased due to the nature of KEYWORD fields. if index.fields[string_key].is_multivalued: if value is None or len(value) is 0: additional_fields[string_key] = [] else: additional_fields[string_key] = value.split(',') else: additional_fields[string_key] = index.fields[string_key].convert(value) else: additional_fields[string_key] = self._to_python(value) del(additional_fields[DJANGO_CT]) del(additional_fields[DJANGO_ID]) if highlight: sa = StemmingAnalyzer() formatter = WhooshHtmlFormatter('em') terms = [token.text for token in sa(query_string)] whoosh_result = whoosh_highlight( additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(), formatter ) additional_fields['highlighted'] = { self.content_field_name: [whoosh_result], } result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, **additional_fields) results.append(result) else: hits -= 1 if self.include_spelling: if spelling_query: spelling_suggestion = self.create_spelling_suggestion(spelling_query) else: spelling_suggestion = self.create_spelling_suggestion(query_string) return { 'results': results, 'hits': hits, 'facets': facets, 'spelling_suggestion': spelling_suggestion, } def create_spelling_suggestion(self, query_string): spelling_suggestion = None reader = self.index.reader() corrector = 
reader.corrector(self.content_field_name) cleaned_query = force_text(query_string) if not query_string: return spelling_suggestion # Clean the string. for rev_word in self.RESERVED_WORDS: cleaned_query = cleaned_query.replace(rev_word, '') for rev_char in self.RESERVED_CHARACTERS: cleaned_query = cleaned_query.replace(rev_char, '') # Break it down. query_words = cleaned_query.split() suggested_words = [] for word in query_words: suggestions = corrector.suggest(word, limit=1) if len(suggestions) > 0: suggested_words.append(suggestions[0]) spelling_suggestion = ' '.join(suggested_words) return spelling_suggestion def _from_python(self, value): """ Converts Python values to a string for Whoosh. Code courtesy of pysolr. """ if hasattr(value, 'strftime'): if not hasattr(value, 'hour'): value = datetime(value.year, value.month, value.day, 0, 0, 0) elif isinstance(value, bool): if value: value = 'true' else: value = 'false' elif isinstance(value, (list, tuple)): value = u','.join([force_text(v) for v in value]) elif isinstance(value, (six.integer_types, float)): # Leave it alone. pass else: value = force_text(value) return value def _to_python(self, value): """ Converts values from Whoosh to native Python values. A port of the same method in pysolr, as they deal with data the same way. """ if value == 'true': return True elif value == 'false': return False if value and isinstance(value, six.string_types): possible_datetime = DATETIME_REGEX.search(value) if possible_datetime: date_values = possible_datetime.groupdict() for dk, dv in date_values.items(): date_values[dk] = int(dv) return datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'], date_values['minute'], date_values['second']) try: # Attempt to use json to load the values. converted_value = json.loads(value) # Try to handle most built-in types. 
if isinstance(converted_value, (list, tuple, set, dict, six.integer_types, float, complex)): return converted_value except: # If it fails (SyntaxError or its ilk) or we don't trust it, # continue on. pass return value
# 方法一 使用FileStorage对象 from whoosh.filedb.filestore import FileStorage storage = FileStorage('index') # idx_path 为索引路径 idx1 = storage.open_index(indexname='idx1') from whoosh import index # 方法二 使用open_dir函数 from whoosh.index import open_dir idx2 = open_dir('index', indexname='idx2') # indexname 为索引名 print(index.exists_in('index', indexname='idx2')) pass from whoosh.qparser import QueryParser, MultifieldParser, OrGroup, FieldsPlugin og = OrGroup.factory(0.9) qp = QueryParser("content", schema=idx1.schema) # group=OrGroup qp.remove_plugin_class(FieldsPlugin) q = qp.parse("reset") print(q) # mqp = MultifieldParser(["title", "content"], schema=idx1.schema) # mq = mqp.parse(u"many only") # # from whoosh.query import * # myquery = And([Term("title", u"third"), q]) # # myquery = Term("title", u"ird") # print(myquery) searcher = idx1.searcher() r = (searcher.search(q=q, limit=None)) print(len(r)) for hit in r: