def build_index(sa_session, whoosh_index_dir, path_to_repositories):
    """
    Build the search indexes. One for repositories and another for tools within.

    :param sa_session: SQLAlchemy session used by get_repos()
    :param whoosh_index_dir: directory for the repository index (tool index
        goes in a 'tools' subdirectory)
    :param path_to_repositories: filesystem root passed through to get_repos()
    """
    # exist_ok=True closes the check-then-create race the old code flagged
    # ("Rare race condition exists here and below").
    os.makedirs(whoosh_index_dir, exist_ok=True)
    tool_index_dir = os.path.join(whoosh_index_dir, 'tools')
    os.makedirs(tool_index_dir, exist_ok=True)
    repo_index_storage = FileStorage(whoosh_index_dir)
    tool_index_storage = FileStorage(tool_index_dir)
    repo_index = repo_index_storage.create_index(repo_schema)
    tool_index = tool_index_storage.create_index(tool_schema)
    repo_index_writer = repo_index.writer()
    tool_index_writer = tool_index.writer()
    repos_indexed = 0
    tools_indexed = 0
    for repo in get_repos(sa_session, path_to_repositories):
        repo_index_writer.add_document(
            id=repo.get('id'),
            name=unicodify(repo.get('name')),
            description=unicodify(repo.get('description')),
            long_description=unicodify(repo.get('long_description')),
            homepage_url=unicodify(repo.get('homepage_url')),
            remote_repository_url=unicodify(repo.get('remote_repository_url')),
            repo_owner_username=unicodify(repo.get('repo_owner_username')),
            times_downloaded=repo.get('times_downloaded'),
            approved=repo.get('approved'),
            last_updated=repo.get('last_updated'),
            full_last_updated=repo.get('full_last_updated'))
        # Tools get their own index
        for tool in repo.get('tools_list'):
            tool_index_writer.add_document(
                id=unicodify(tool.get('id')),
                name=unicodify(tool.get('name')),
                version=unicodify(tool.get('version')),
                description=unicodify(tool.get('description')),
                help=unicodify(tool.get('help')),
                repo_owner_username=unicodify(repo.get('repo_owner_username')),
                repo_name=unicodify(repo.get('name')),
                repo_id=repo.get('id'))
            tools_indexed += 1
            print(tools_indexed, 'tools (', tool.get('id'), ')')
        repos_indexed += 1
        print(repos_indexed, 'repos (', repo.get('id'), ')')
    tool_index_writer.commit()
    repo_index_writer.commit()
    print("TOTAL repos indexed: ", repos_indexed)
    print("TOTAL tools indexed: ", tools_indexed)
def build_index(sa_session, whoosh_index_dir, path_to_repositories):
    """
    Build the search indexes. One for repositories and another for tools within.

    :param sa_session: SQLAlchemy session used by get_repos()
    :param whoosh_index_dir: directory for the repository index (tool index
        goes in a 'tools' subdirectory)
    :param path_to_repositories: filesystem root passed through to get_repos()
    """
    # exist_ok=True closes the check-then-create race the old code flagged
    # ("Rare race condition exists here and below").
    os.makedirs(whoosh_index_dir, exist_ok=True)
    tool_index_dir = os.path.join(whoosh_index_dir, 'tools')
    os.makedirs(tool_index_dir, exist_ok=True)
    repo_index_storage = FileStorage(whoosh_index_dir)
    tool_index_storage = FileStorage(tool_index_dir)
    repo_index = repo_index_storage.create_index(repo_schema)
    tool_index = tool_index_storage.create_index(tool_schema)
    repo_index_writer = repo_index.writer()
    tool_index_writer = tool_index.writer()
    repos_indexed = 0
    tools_indexed = 0
    for repo in get_repos(sa_session, path_to_repositories):
        repo_index_writer.add_document(
            id=repo.get('id'),
            name=unicodify(repo.get('name')),
            description=unicodify(repo.get('description')),
            long_description=unicodify(repo.get('long_description')),
            homepage_url=unicodify(repo.get('homepage_url')),
            remote_repository_url=unicodify(repo.get('remote_repository_url')),
            repo_owner_username=unicodify(repo.get('repo_owner_username')),
            times_downloaded=repo.get('times_downloaded'),
            approved=repo.get('approved'),
            last_updated=repo.get('last_updated'),
            full_last_updated=repo.get('full_last_updated'))
        # Tools get their own index
        for tool in repo.get('tools_list'):
            tool_index_writer.add_document(
                id=unicodify(tool.get('id')),
                name=unicodify(tool.get('name')),
                version=unicodify(tool.get('version')),
                description=unicodify(tool.get('description')),
                help=unicodify(tool.get('help')),
                repo_owner_username=unicodify(repo.get('repo_owner_username')),
                repo_name=unicodify(repo.get('name')),
                repo_id=repo.get('id'))
            tools_indexed += 1
            print(tools_indexed, 'tools (', tool.get('id'), ')')
        repos_indexed += 1
        print(repos_indexed, 'repos (', repo.get('id'), ')')
    tool_index_writer.commit()
    repo_index_writer.commit()
    print("TOTAL repos indexed: ", repos_indexed)
    print("TOTAL tools indexed: ", tools_indexed)
def _get_index(self, language=None):
    """Return the Whoosh index for this object, opening an existing one
    or creating a fresh one from the schema when none exists yet."""
    storage = FileStorage(self._index_dir).create()
    if storage.index_exists():
        return storage.open_index()
    return storage.create_index(self._get_schema(language))
def build_index():
    """building the index from scratch"""
    # Python 2 module: progress is reported with print statements.
    print "building index.."
    index_dir = PYTHON_SEARCH_DIR
    if TEST_COLLECTION:
        index_dir = PYTHON_SEARCH_DIR_TEST
    #CR_DOCS_DB.drop()
    #CR_DOCS_DB.ensure_index("code_id", unique=True)
    # Start from an empty directory so the index is rebuilt completely.
    if os.path.exists(index_dir):
        shutil.rmtree(index_dir)
    os.mkdir(index_dir)
    schema = get_schema()
    storage = FileStorage(index_dir)
    ix = storage.create_index(schema)
    w = ix.writer()
    print "finding posts.."
    # Only questions whose answers contain a <code> block are indexed.
    posts_with_code = POSTS_DB.find({"answers.Body": {"$regex": "/.*<code>.*/"}}, timeout=False)
    print "adding files.."
    q = 0
    for i, question in enumerate(posts_with_code):
        q += add_doc(w, question)
        # Commit in batches of 1000 questions; a committed writer cannot be
        # reused, so a new one is opened after every batch commit.
        if i % 1000 == 0 and not i == 0:
            print "commit number:", str(i / 1000), "with", q, "codes"
            w.commit()
            w = ix.writer()
    w.commit()
    posts_with_code.close()
    print "the index was built!"
    return ix
def build_index(sa_session, toolshed_whoosh_index_dir):
    # Build a Whoosh index over every tool shed repository (Python 2 code).
    storage = FileStorage(toolshed_whoosh_index_dir)
    index = storage.create_index(schema)
    writer = index.writer()

    def to_unicode(a_basestr):
        # Whoosh requires unicode field values; decode plain str as UTF-8.
        if type(a_basestr) is str:
            return unicode(a_basestr, 'utf-8')
        else:
            return a_basestr
    repos_indexed = 0
    for (id, name, description, long_description, homepage_url,
         remote_repository_url, repo_owner_username, times_downloaded,
         approved, last_updated, full_last_updated) in get_repos(sa_session):
        writer.add_document(
            id=id,
            name=to_unicode(name),
            description=to_unicode(description),
            long_description=to_unicode(long_description),
            homepage_url=to_unicode(homepage_url),
            remote_repository_url=to_unicode(remote_repository_url),
            repo_owner_username=to_unicode(repo_owner_username),
            times_downloaded=times_downloaded,
            approved=approved,
            last_updated=last_updated,
            full_last_updated=full_last_updated)
        repos_indexed += 1
    writer.commit()
    print "Number of repos indexed: ", repos_indexed
def __init__(self, path: Path):
    """Open the Whoosh index stored at *path*, creating the directory and a
    fresh index from ``self.schema`` when none exists yet."""
    storage = FileStorage(fspath(path))
    # Check for an actual index, not merely the directory: previously an
    # existing-but-empty directory made open_index() raise EmptyIndexError.
    if path.exists() and storage.index_exists():
        self.ix = storage.open_index()
    else:
        path.mkdir(exist_ok=True, parents=True)
        self.ix = storage.create_index(self.schema)
def create_index(sender=None, **kwargs):
    """Creates a File based whoosh index, location used is settings.WHOOSH_INDEX
    so make sure that is set"""
    if not os.path.exists(settings.WHOOSH_INDEX):
        try:
            # makedirs (vs. mkdir) also handles nested WHOOSH_INDEX paths.
            os.makedirs(settings.WHOOSH_INDEX)
        except OSError:
            # Directory may have appeared between check and create (race);
            # re-raise only if it genuinely could not be created.
            if not os.path.isdir(settings.WHOOSH_INDEX):
                raise
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.create_index(schema=WHOOSH_SCHEMA, indexname="search")
def init_index(index=".index"):
    """Create a fresh Whoosh index in *index* (made if missing) and return
    it reopened from storage."""
    target = index
    if not os.path.exists(target):
        os.mkdir(target)
    # os.rmdir(index)
    storage = FileStorage(target)
    schema = Schema(name=TEXT(stored=True),
                    ext=KEYWORD,
                    title=TEXT(stored=True),
                    content=TEXT,
                    path=ID(stored=True),
                    tags=KEYWORD)
    # create_index always starts an empty index; it is then reopened.
    storage.create_index(schema)
    return storage.open_index()
def _setup(self, storage_directory):
    """Build the schema and its index: file-backed when a directory is
    given (reusing an existing one), otherwise held in RAM."""
    schema = fields.Schema(
        oid=fields.ID(stored=True, unique=True),
        name=fields.ID())
    schema.add('*', fields.TEXT, glob=True)

    if not storage_directory:
        # No directory configured: keep everything in memory.
        ram = RamStorage()
        return (schema, ram.create_index(schema))

    if os.path.exists(storage_directory):
        # Reuse whatever index is already on disk.
        self._using_existing_index = True
        ix = FileStorage(storage_directory).open_index()
    else:
        os.mkdir(storage_directory)
        ix = FileStorage(storage_directory).create_index(schema)
    return (schema, ix)
def get_index(index, schema, refresh=False):
    """Return the index stored in the *index* directory beside this module.

    Opens the existing index unless it is missing or *refresh* is set, in
    which case a new one is created from *schema*.
    """
    index_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), index)
    if exists_in(index_dir) and not refresh:
        return FileStorage(index_dir).open_index()
    # Fixed: the original built a FileStorage up front and then shadowed it
    # with a second one here; a single instance suffices. .create() makes
    # the directory if necessary before the index is created.
    storage = FileStorage(index_dir).create()
    return storage.create_index(schema)
def get_myindex(indexdir='indexdir', filestore=False):
    """Return a Whoosh index in *indexdir*.

    With filestore=False the directory is (re)created via index.create_in
    and then reopened. With filestore=True an existing index is now reused
    instead of being wiped (this implements the old in-code TODO).
    """
    schema = get_schema()
    if not filestore:
        if not os.path.exists(indexdir):
            os.mkdir(indexdir)
        ix = index.create_in(indexdir, schema)
        ix = index.open_dir(indexdir)
    else:
        storage = FileStorage(indexdir)
        # Fixed per the old TODO: when an index already exists in indexdir,
        # open it instead of clobbering it with create_index() every call.
        if index.exists_in(indexdir):
            ix = storage.open_index()
        else:
            ix = storage.create_index(schema)
    return ix
def get_index(name, schema, path, clean=False):
    """Open the index *name* stored under *path*, creating the directory
    and/or the index as needed; *clean* forces a fresh index."""
    if not os.path.exists(path):
        os.makedirs(path)
    st = FileStorage(path)
    if not clean:
        try:
            return st.open_index(indexname=name)
        except EmptyIndexError:
            # Fall through and create a new index below.
            pass
    return st.create_index(schema, indexname=name)
def init_index(index=".index"):
    """Create a fresh Whoosh index in *index* (made if missing) and return
    it reopened from storage."""
    directory = index
    if not os.path.exists(directory):
        os.mkdir(directory)
    # os.rmdir(index)
    storage = FileStorage(directory)
    schema = Schema(name=TEXT(stored=True),
                    ext=KEYWORD,
                    title=TEXT(stored=True),
                    content=TEXT,
                    path=ID(stored=True),
                    tags=KEYWORD)
    # create_index always starts an empty index; it is then reopened.
    storage.create_index(schema)
    return storage.open_index()
def indexLibraries(db, index_field="name", index_folder="index_fullname"):
    """Index every library document from the ArangoDB *db* into a Whoosh
    index stored in *index_folder* next to this module."""
    print("whoosh Indexing")
    schema = Schema(
        fullname=TEXT(analyzer=StemmingAnalyzer(), spelling=True),
        id=TEXT(stored=True))
    base_dir = os.path.dirname(os.path.abspath(__file__))
    storage = FileStorage(os.path.join(base_dir, index_folder)).create()
    idx = storage.create_index(schema)
    documents = db.AQLQuery("FOR library in libraries RETURN library",
                            rawResults=True, batchSize=10000)
    writer = idx.writer()
    for document in documents:
        writer.update_document(fullname=document[index_field],
                               id=document["_id"])
    writer.commit()
def create_index(request):
    """Django view: (re)index the name of every Goods row into the on-disk
    Whoosh index and report completion."""
    analyzer = ChineseAnalyzer()
    schema = Schema(
        ids=TEXT(stored=True, analyzer=analyzer),
        name=TEXT(stored=True, analyzer=analyzer))
    file_storage = FileStorage('D:/Chihuo/shopping_mall/index')
    if os.path.exists('D:/Chihuo/shopping_mall/index'):
        ix = file_storage.open_index()
    else:
        os.mkdir('D:/Chihuo/shopping_mall/index')
        ix = file_storage.create_index(schema)
    writer = ix.writer()
    for goods in Goods.objects.all():
        writer.add_document(ids=str(goods.id), name=goods.name)
        print(goods.name)
    writer.commit()
    return HttpResponse('创建完成')
def BuiltIndex(self): analyzer = ChineseAnalyzer() # define schema schema = Schema(title=TEXT(sortable=True), zb_url=TEXT(sortable=True), ctime=TEXT(sortable=True), deadline=TEXT(sortable=True), bsdeadline=TEXT(sortable=True), dbtb=TEXT(sortable=True), content=TEXT(sortable=True, analyzer=analyzer), lettercard=TEXT(sortable=True, analyzer=analyzer)) dirname = './whoosh_index' storage = FileStorage(dirname) if not os.path.exists(dirname): os.mkdir(dirname) # create index file ix = storage.create_index(schema, indexname='Hello') else: ix = storage.open_index(indexname='Hello') writer = ix.writer() # fetch rows from DB num = 0 try: with connection.cursor() as cursor: for tbname in self.Gettabs(): sql = '''SELECT `title`, `zb_url`, `ctime`, `deadline`, `bsdeadline`,`dbtb`, `content`, `lettercard` FROM {}'''.format( tbname) cursor.execute(sql) rows = cursor.fetchall() # write the rows into indexes for row in rows: writer.add_document(title=str(row["title"]), zb_url=str(row["zb_url"]), ctime=str(row["ctime"]), deadline=str(row['deadline']), bsdeadline=str(row['bsdeadline']), dbtb=str(row["dbtb"]), content=str(row["content"]), lettercard=str(row["lettercard"])) num += 1 writer.commit() finally: connection.close() print("%d docs indexed!" % num)
def create_in(dirname, schema, indexname=None):
    """Convenience function to create an index in a directory, constructing
    the FileStorage object for you.

    :param dirname: the path string of the directory in which to create the
        index.
    :param schema: a :class:`whoosh.fields.Schema` object describing the
        index's fields.
    :param indexname: the name of the index to create; only needed when
        creating multiple indexes within the same storage object.
    :returns: :class:`Index`
    """
    from whoosh.filedb.filestore import FileStorage

    indexname = indexname or _DEF_INDEX_NAME
    return FileStorage(dirname).create_index(schema, indexname)
def create_in(dirname, schema, indexname=None):
    """Convenience function to create an index in a directory; takes care of
    building the FileStorage object for you.

    :param dirname: the path string of the directory in which to create the
        index.
    :param schema: a :class:`whoosh.fields.Schema` object describing the
        index's fields.
    :param indexname: the name of the index to create; only needed when
        creating multiple indexes within the same storage object.
    :returns: :class:`Index`
    """
    from whoosh.filedb.filestore import FileStorage

    if indexname is None or not indexname:
        indexname = _DEF_INDEX_NAME
    storage = FileStorage(dirname)
    return storage.create_index(schema, indexname)
def create_index(request):
    """Django view: index name, note and id of every Good_type row into the
    on-disk Whoosh index and report completion."""
    analyzer = ChineseAnalyzer()
    schema = Schema(
        name=TEXT(stored=True, analyzer=analyzer),
        datail=TEXT(stored=True, analyzer=analyzer),
        ids=TEXT(stored=True))
    file_storage = FileStorage('./index')
    if os.path.exists('./index'):
        ix = file_storage.open_index()
    else:
        os.mkdir('./index')
        ix = file_storage.create_index(schema)
    writer = ix.writer()
    for goods in Good_type.objects.all():
        writer.add_document(name=goods.name,
                            datail=goods.note,
                            ids=str(goods.id))
    writer.commit()
    return HttpResponse('索引创建完成')
def create_in(dirname, schema, indexname=None, byteorder=None):
    """Convenience function to create an index in a directory, constructing
    the FileStorage object for you.

    :param dirname: the path string of the directory in which to create the
        index.
    :param schema: a :class:`whoosh.fields.Schema` object describing the
        index's fields.
    :param indexname: the name of the index to create; only needed when
        creating multiple indexes within the same storage object.
    :param byteorder: the byte order to use when writing numeric values to
        disk: 'big', 'little', or None. If None (the default), Whoosh uses
        the native platform order.
    :returns: :class:`Index`
    """
    from whoosh.filedb.filestore import FileStorage

    indexname = indexname or _DEF_INDEX_NAME
    return FileStorage(dirname, byteorder=byteorder).create_index(schema, indexname)
def __init__(self, index_dir="whoosh_index", schema_type="", schema_name="default_schema"):
    """Create or load a Whoosh index stored under index_dir/schema_name.

    schema_type selects the field layout: "dialogs" or "embedding".
    """
    self.index_dir = index_dir
    self.schema_type = schema_type
    self.schema_name = schema_name
    self.schema_dir = self.index_dir + "/" + self.schema_name
    self.search_limit = 100
    # Empty stoplist: keep every token, including stopwords.
    analyzer = analysis.StandardAnalyzer(stoplist=frozenset([]))
    # create schema
    if self.schema_type == "dialogs":
        self.schema = Schema(dialog=TEXT(analyzer=analyzer, stored=True),
                             lang=ID(stored=True),
                             turn=NUMERIC(stored=True),
                             vector=STORED)
    elif self.schema_type == "embedding":
        self.schema = Schema(key=ID(stored=True), vector=STORED)
    # NOTE(review): any other schema_type leaves self.schema unset, so
    # create_index below would fail with AttributeError — confirm callers
    # only pass the two supported values.
    # create index
    if not os.path.exists(self.index_dir):
        os.mkdir(self.index_dir)
    if not os.path.exists(self.schema_dir):
        os.mkdir(self.schema_dir)
    # create / load index
    storage = FileStorage(self.schema_dir)
    # check index exists
    if storage.index_exists():
        print('index exists, loading.')
        # open
        self.ix = storage.open_index()
    else:
        print('index doesn\'t exists, creating.')
        # create
        self.ix = storage.create_index(self.schema)
    # open index directory
    # self.ix = open_dir(self.schema_dir)
    self.writer = None
def build_index( sa_session, whoosh_index_dir ):
    # Build a Whoosh index over all library datasets (Python 2 code).
    storage = FileStorage( whoosh_index_dir )
    index = storage.create_index( schema )
    writer = index.writer()

    def to_unicode( a_basestr ):
        # Whoosh requires unicode field values; decode plain str as UTF-8.
        if type( a_basestr ) is str:
            return unicode( a_basestr, 'utf-8' )
        else:
            return a_basestr
    lddas_indexed = 0
    for id, name, info, dbkey, message in get_lddas( sa_session ):
        writer.add_document( id=id,
                             name=to_unicode( name ),
                             info=to_unicode( info ),
                             dbkey=to_unicode( dbkey ),
                             message=to_unicode( message ) )
        lddas_indexed += 1
    writer.commit()
    print "Number of active library datasets indexed: ", lddas_indexed
def newIndex():
    '''
    newIndex()
    Creates the index/schema for the Whoosh module
    INPUTS: (none)
    OUTPUTS: idx -- index
    '''
    # Python 2 code: progress reported with print statements.
    print '\tCreating a new Index in the current directory'
    # Create an index to store the artist/title and lyrics
    schm = Schema(Name=TEXT(stored=True), Ingr=KEYWORD(stored=True, commas=True))
    # Create a directory called FAR_Storage; will contain the index
    # See Whoosh documentation for more information
    if not os.path.exists('FAR_Storage'):
        os.mkdir('FAR_Storage')
    idxDir = 'FAR_Storage'
    storage = FileStorage(idxDir)
    # create_index starts a fresh index named 'FAR'; it is then reopened.
    idx = storage.create_index(schm, indexname='FAR')
    idx = storage.open_index(indexname = 'FAR')
    return idx
def create_index(index_dir, data_dir):
    """Rebuild the full-text index in *index_dir* from every Markdown file
    under *data_dir*.

    Returns True on success; raises PermissionError if the index directory
    cannot be recreated. Unreadable files are skipped (best effort).
    """
    schema = Schema(path=ID(stored=True, unique=True),
                    content=TEXT(stored=True))
    storage_obj = FileStorage(index_dir)
    if whoosh.index.exists_in(index_dir):
        try:
            shutil.rmtree(index_dir)
            os.makedirs(index_dir)
        except OSError:
            # Narrowed from a bare except: only filesystem failures mean
            # the directory could not be recreated.
            raise PermissionError(
                _("Das Index-Verzeichnis konnte nicht erstellt werden"))
    idx = storage_obj.create_index(schema)
    writer = idx.writer()
    # Iterate over every file ending in .md
    for (path, dirs, files) in os.walk(data_dir):
        # Remove the git-Folder
        if '.git' in dirs:
            dirs.remove('.git')
        for article in files:
            if article.endswith('.md'):
                article_path = os.path.join(
                    os.path.relpath(path, data_dir).strip('./'), article)
                try:
                    # Get file content
                    with codecs.open(os.path.join(path, article), "r", "utf-8") as f:
                        content = f.read()
                    writer.add_document(path=article_path, content=content)
                except (OSError, UnicodeError):
                    # Narrowed from a bare except: skip files that cannot be
                    # read or decoded, keep indexing the rest (best effort).
                    continue
    writer.commit()
    return True
def test_storage_creation():
    """Verify FileStorage.create() makes the directory, an index can be
    written, and destroy() removes everything again."""
    import tempfile, uuid
    from whoosh import fields
    from whoosh.filedb.filestore import FileStorage

    schema = fields.Schema(text=fields.TEXT)
    dirpath = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
    assert not os.path.exists(dirpath)

    st = FileStorage(dirpath)
    st.create()
    assert os.path.exists(dirpath)

    ix = st.create_index(schema)
    with ix.writer() as w:
        w.add_document(text=u("alfa bravo"))
        w.add_document(text=u("bracho charlie"))

    st.destroy()
    assert not os.path.exists(dirpath)
def build_index(sa_session, whoosh_index_dir):
    """Index all library datasets returned by get_lddas() into a new Whoosh
    index in *whoosh_index_dir* and report the count."""

    def to_unicode(a_basestr):
        # Whoosh needs text values; decode non-text input as UTF-8.
        if isinstance(a_basestr, text_type):
            return a_basestr
        return text_type(a_basestr, 'utf-8')

    storage = FileStorage(whoosh_index_dir)
    writer = storage.create_index(schema).writer()
    lddas_indexed = 0
    for id, name, info, dbkey, message in get_lddas(sa_session):
        writer.add_document(id=id,
                            name=to_unicode(name),
                            info=to_unicode(info),
                            dbkey=to_unicode(dbkey),
                            message=to_unicode(message))
        lddas_indexed += 1
    writer.commit()
    print("Number of active library datasets indexed: ", lddas_indexed)
def test_storage_creation():
    """FileStorage lifecycle test: create() builds the directory, an index
    accepts documents, destroy() removes the directory."""
    import tempfile, uuid
    from whoosh import fields
    from whoosh.filedb.filestore import FileStorage

    schema = fields.Schema(text=fields.TEXT)
    unique_name = uuid.uuid4()
    target = os.path.join(tempfile.gettempdir(), str(unique_name))
    assert not os.path.exists(target)

    storage = FileStorage(target)
    storage.create()
    assert os.path.exists(target)

    ix = storage.create_index(schema)
    with ix.writer() as writer:
        writer.add_document(text=u("alfa bravo"))
        writer.add_document(text=u("bracho charlie"))

    storage.destroy()
    assert not os.path.exists(target)
def build_index( sa_session, toolshed_whoosh_index_dir ):
    # Build a Whoosh index over every tool shed repository (Python 2 code).
    storage = FileStorage( toolshed_whoosh_index_dir )
    index = storage.create_index( schema )
    writer = index.writer()

    def to_unicode( a_basestr ):
        # Whoosh requires unicode field values; decode plain str as UTF-8.
        if type( a_basestr ) is str:
            return unicode( a_basestr, 'utf-8' )
        else:
            return a_basestr
    repos_indexed = 0
    for ( id, name, description, long_description, homepage_url,
          remote_repository_url, repo_owner_username, times_downloaded,
          approved, last_updated, full_last_updated ) in get_repos( sa_session ):
        writer.add_document( id = id,
                             name = to_unicode( name ),
                             description = to_unicode( description ),
                             long_description = to_unicode( long_description ),
                             homepage_url = to_unicode( homepage_url ),
                             remote_repository_url = to_unicode( remote_repository_url ),
                             repo_owner_username = to_unicode( repo_owner_username ),
                             times_downloaded = times_downloaded,
                             approved = approved,
                             last_updated = last_updated,
                             full_last_updated = full_last_updated )
        repos_indexed += 1
    writer.commit()
    print "Number of repos indexed: ", repos_indexed
def index(self):
    # Build the Whoosh index from the iTunes library at self.path, unless
    # an index was already built (self.empty is False). Python 2 code.
    if self.empty:
        if not os.path.exists(self.folder):
            os.makedirs(self.folder)
        st = FileStorage(self.folder)
        ix = st.create_index(self.schema)
        w = ix.writer()
        # NOTE(review): looks like a leftover placeholder document — confirm
        # it is intentional.
        w.add_document(name = u"beuha")
        pipe = file.ID3Filter()
        #[TODO] using itunes info for artwork?
        cpt = 0
        for track in pipe(ItunesParser(self.path)):
            if track['album'] != None :
                album = track['album'].encode('ascii', 'ignore')
            else:
                album = ""
            #print track['artwork'], "[%s]" % album, track['name'].encode('ascii', 'ignore')
            # Console progress: running count every 20 tracks, '#' per track.
            if cpt % 20 == 0:
                print "\n%i " %cpt,
            print '#',
            #print track['album'], track['name']
            w.add_document( trackId = track['trackId'],
                            name=track['name'] ,artist=track['artist'],
                            album=track['album'], genre=track['genre'],
                            location=track['location'],
                            artwork=boolean(track['artwork']),
                            trackNumber=track['trackNumber'],
                            bitRate=track['bitRate'] )
            #if cpt % 100 == 1:
            #    w.commit()
            cpt += 1
        print "\n\n%i tracks indexed" % cpt
        # Single commit at the end, then compact and close the index.
        w.commit()
        ix.optimize()
        ix.close()
    else :
        print "already indexed"
class TinaIndex():
    """
    Open or Create a whoosh index
    Provides searching methods
    """

    def __init__( self, indexdir ):
        # indexdir: directory holding (or to hold) the Whoosh index files.
        self.writer = None
        self.reader = None
        self.searcher = None
        self.indexdir = indexdir
        self.storage = FileStorage(self.indexdir)
        self.index = None
        try:
            self.index = self.storage.open_index()
        except EmptyIndexError, e:
            # No index yet: create the directory if needed and build a new
            # empty index from the default schema.
            _logger.warning( "No existing index at %s : "%self.indexdir)
            self.schema = TinaSchema()
            if not os.path.exists(self.indexdir):
                os.mkdir(self.indexdir)
            self.index = self.storage.create_index(self.schema)
        except LockError, le:
            # Another process holds the index lock; log and propagate.
            _logger.error("index LockError %s : "%self.indexdir)
            raise LockError(le)
def make_index(self, dirname, schema, ixname):
    """Create a fresh index named *ixname* inside *dirname*, creating the
    directory first when it does not exist."""
    if not exists(dirname):
        mkdir(dirname)
    storage = FileStorage(dirname)
    return storage.create_index(schema, indexname=ixname)
if not os.path.exists("indexdir"): os.mkdir("indexdir") ix = index.create_in("indexdir", schema) 打开一个已经存在某个目录的索引,使用index.open_dir() [python] view plain copy import whoosh.index as index ix = index.open_dir("indexdir") 这些是便利方法: [python] view plain copy from whoosh.filedb.filestore import FileStorage storage = FileStorage("indexdir") # Create an index ix = storage.create_index(schema) # Open an existing index storage.open_index() 你和index对象一起创建的schema对象是可序列化的并且和index一起存储 你可以在同一个目录下面使用多个索引,用关键字参数分开 [python] view plain copy # Using the convenience functions ix = index.create_in("indexdir", schema=schema, indexname="usages") ix = index.open_dir("indexdir", indexname="usages") # Using the Storage object ix = storage.create_index(schema, indexname="usages") ix = storage.open_index(indexname="usages") Clearing the index
class WhooshSearchBackend(BaseSearchBackend):
    # Haystack search backend for Whoosh (Python 2 era code).

    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\',
        '+',
        '-',
        '&&',
        '||',
        '!',
        '(',
        ')',
        '{',
        '}',
        '[',
        ']',
        '^',
        '"',
        '~',
        '*',
        '?',
        ':',
        '.',
    )

    def __init__(self, connection_alias, **connection_options):
        # connection_options: the HAYSTACK_CONNECTIONS entry for this alias.
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # NOTE(review): getattr() on a dict always returns the default here;
        # this was probably meant to be connection_options.get('POST_LIMIT', ...)
        # — confirm before relying on POST_LIMIT being honoured.
        self.post_limit = getattr(connection_options, 'POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')
        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False
        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)
        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections
        new_index = False
        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True
        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % self.path)
        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            # RAM storage is shared process-wide via the LOCALS global.
            global LOCALS
            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()
            self.storage = LOCALS.RAM_STORE
        self.content_field_name, self.schema = self.build_schema(connections[
            self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)
        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                # Directory existed but held no index yet.
                self.index = self.storage.create_index(self.schema)
        self.setup_complete = True

    def build_schema(self, fields):
        # Translate Haystack field classes into a Whoosh Schema; returns
        # (content_field_name, Schema).
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''
        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True,
                        commas=True,
                        scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    type=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored,
                    type=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3,
                    maxsize=15,
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2,
                    maxsize=15,
                    at='start',
                    stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True,
                    analyzer=StemmingAnalyzer(),
                    field_boost=field_class.boost)
            if field_class.document is True:
                content_field_name = field_class.index_fieldname
        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )
        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        # Add/refresh the documents for every object in *iterable*.
        if not self.setup_complete:
            self.setup()
        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)
        for obj in iterable:
            doc = index.full_prepare(obj)
            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])
            try:
                writer.update_document(**doc)
            except Exception, e:
                if not self.silently_fail:
                    raise
                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__name__,
                               exc_info=True,
                               extra={
                                   "data": {
                                       "index": index,
                                       "object": get_identifier(obj)
                                   }
                               })
        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()
            # If spelling support is desired, add to the dictionary.
            if self.include_spelling is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
def create_index(sender=None, **kwargs):
    """Signal handler: create a file-based Whoosh index at
    settings.WHOOSH_INDEX (the directory is created when missing)."""
    if not os.path.exists(settings.WHOOSH_INDEX):
        try:
            # makedirs (vs. mkdir) also handles nested WHOOSH_INDEX paths.
            os.makedirs(settings.WHOOSH_INDEX)
        except OSError:
            # Directory may have appeared between check and create (race);
            # re-raise only if it genuinely could not be created.
            if not os.path.isdir(settings.WHOOSH_INDEX):
                raise
    storage = FileStorage(settings.WHOOSH_INDEX)
    ix = storage.create_index(schema=WHOOSH_SCHEMA)
class Library(object):
    # Whoosh-backed lookup service over Chip rows.

    # Maximum hits for a normal search.
    RESULTS_LIMIT = 700
    # Maximum hits when falling back to fuzzy matching.
    FUZZY_LIMIT = 5
    SUGGESTIONS_LIMIT = 5

    def __init__(self, dbsession, **settings):
        """Initializes Whoosh by setting up and loading indexes for lookup."""
        self._dbsession = dbsession
        self.schema = ChipSchema()
        self.directory = settings.get(
            'whoosh.store',
            os.path.join(settings['config_path'], 'whoosh-data')
        )
        self.indexname = settings.get(
            'whoosh.indexname',
            'chips'
        )
        self.rebuild = asbool(settings.get('whoosh.rebuild', 'false'))
        self.storage = FileStorage(self.directory)
        # NOTE(review): setindex() is invoked here and again in both branches
        # below, so it runs twice on every path — confirm this is intentional.
        self.setindex()
        if self.rebuild:
            self.setindex()
            self.buildindex()
        else:
            self.setindex()

    def setindex(self):
        # Ensure the index directory exists and open/create the named index;
        # when rebuilding, the directory is wiped first.
        if self.rebuild and os.path.exists(self.directory):
            shutil.rmtree(self.directory)
        if not os.path.exists(self.directory):
            os.mkdir(self.directory)
        if whoosh.index.exists_in(
            self.directory,
            indexname=self.indexname
        ):
            if self.rebuild:
                # Wipe and recurse once to create a fresh index.
                shutil.rmtree(self.directory)
                self.setindex()
            else:
                self.index = self.storage.open_index(indexname=self.indexname)
        else:
            self.index = self.storage.create_index(
                self.schema,
                indexname=self.indexname
            )

    def buildindex(self):
        # Index every Chip row from the database into self.index.
        q = self._dbsession.query(Chip).all()
        writer = self.index.writer()
        for chip in q:
            try:
                version = chip.version.name
            except AttributeError:
                # Chip has no version relation.
                version = ''
            writer.add_document(
                id=str(chip.id),
                indice=str(chip.indice),
                indice_game=str(chip.indice_game),
                name=chip.name.lower(),
                name_jp=chip.name_jp,
                name_display=chip.name,
                game=chip.game.name.lower(),
                game_enum=chip.game,
                version=version,
                version_enum=chip.version,
                classification=chip.classification.name,
                classification_enum=chip.classification,
                element=chip.element.name,
                element_enum=chip.element,
                description=chip.description,
                code=','.join(chip.codes_iter()).lower(),
                size=str(chip.size),
                damage_min=str(chip.damage_min),
                damage_max=str(chip.damage_max),
                recovery=str(chip.recovery),
                rarity=str(chip.rarity)
            )
        # writing.CLEAR: replace the whole index contents with this batch.
        writer.commit(writing.CLEAR)

    def lookup(self, term, fuzzy=False, limit=None):
        # Multi-field search for *term*; retries once with fuzzy matching
        # when the exact search returns nothing.
        term = term.strip()
        term = term.lower()
        if limit:
            limit = limit
        else:
            limit = self.RESULTS_LIMIT
        fields = (
            'indice',
            'indice_game',
            'name',
            'name_jp',
            'game',
            'version',
            'classification',
            'element',
            'code',
            'size',
            'damage_min',
            'damage_max',
            'recovery',
            'rarity'
        )
        if fuzzy:
            parser = MultifieldParser(
                fields,
                schema=self.index.schema,
                termclass=FuzzyTerm
            )
        else:
            parser = MultifieldParser(fields, schema=self.index.schema)
        # Custom query syntax: & (and), | (or), &! (andnot), &~ (andmaybe),
        # - (not).
        operators = OperatorsPlugin(
            And="&",
            Or="\\|",
            AndNot="&!",
            AndMaybe="&~",
            Not="\\-"
        )
        parser.replace_plugin(operators)
        query = parser.parse(term)
        results = []
        try:
            searcher = self.index.searcher()
            results = searcher.search(query, limit=limit)
            if not results and not fuzzy:
                # Try a Fuzzy Search.
                return self.lookup(term, fuzzy=True, limit=self.FUZZY_LIMIT)
        except IndexError:
            pass
        return results
def build_whoosh_index_cron():
    """Cron job: build the full-text search indexes for sites and articles.

    On first boot the "site" and "article" indexes are created from scratch;
    on later runs they are opened and only not-yet-indexed items are added.
    Finally, articles older than one week are purged from the article index.

    Returns True on completion.
    """
    from web.utils import whoosh_site_schema, whoosh_article_schema
    from whoosh.filedb.filestore import FileStorage
    from whoosh.qparser import QueryParser

    idx_dir = settings.WHOOSH_IDX_DIR

    first_boot = False
    if not os.path.exists(idx_dir):
        os.makedirs(idx_dir)
        first_boot = True

    storage = FileStorage(idx_dir)

    # Index the sites.
    if first_boot:
        idx = storage.create_index(whoosh_site_schema, indexname="site")
    else:
        idx = storage.open_index(indexname="site", schema=whoosh_site_schema)

    idx_writer = idx.writer()
    for site_id in get_active_sites():
        # Skip sites that are already indexed.
        if is_indexed('site', site_id) and not first_boot:
            continue
        try:
            site = Site.objects.get(pk=site_id, status='active')
        except Exception:
            # BUG FIX: was a bare ``except:``; narrowed so signals such as
            # KeyboardInterrupt are no longer swallowed.
            continue

        cname = split_cn_words(site.cname, join=True)
        author = site.author or ''
        brief = split_cn_words(site.brief, join=True)

        logger.info(f"源分词结果:`{site_id}`{cname}`{brief}")

        try:
            idx_writer.add_document(id=site_id, cname=cname, author=author, brief=brief)
            set_indexed('site', site_id)
        except Exception:
            logger.warning(f"源索引失败:`{site_id}")
    idx_writer.commit()

    # Index the articles.
    if first_boot:
        idx = storage.create_index(whoosh_article_schema, indexname="article")
    else:
        idx = storage.open_index(indexname="article", schema=whoosh_article_schema)

    idx_writer = idx.writer()
    for uindex in get_recent_articles():
        # Skip articles that are already indexed.
        if is_indexed('article', uindex) and not first_boot:
            continue
        try:
            article = Article.objects.get(uindex=uindex, status='active')
        except Exception:
            continue

        content = get_content(uindex, article.site_id)

        if content:
            title = split_cn_words(article.title, join=True)
            author = article.author or ''
            content_soup = BeautifulSoup(content, 'html.parser')
            content = split_cn_words(content_soup.get_text(), join=True, limit=20)

            logger.info(f"文章分词结果:`{uindex}`{title}")

            try:
                idx_writer.add_document(uindex=uindex, title=title, author=author, content=content)
                set_indexed('article', uindex)
            except Exception:
                logger.warning(f"文章索引失败:`{uindex}")
    idx_writer.commit()

    # Purge articles older than one week from the index.
    idx = storage.open_index(indexname="article", schema=whoosh_article_schema)
    idx_writer = idx.writer()
    lastweek_ts = str(current_ts() - 7 * 86400 * 1000)
    # Whoosh range syntax: open lower bound up to (and including) lastweek_ts.
    query = QueryParser("uindex", idx.schema).parse('uindex:{to %s]' % lastweek_ts)

    with idx.searcher() as searcher:
        idx_writer.delete_by_query(query, searcher)
    idx_writer.commit()

    return True
class WhooshSearchBackend(BaseSearchBackend):
    """Haystack search backend for Whoosh, using a ChineseAnalyzer for text
    fields.

    Stores the index either on disk (FileStorage) or in RAM, translates
    Haystack field definitions into a Whoosh schema, and implements the
    standard backend operations (update/remove/clear/search/more_like_this).
    """

    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, connection_alias, **connection_options):
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # BUG FIX: ``connection_options`` is a dict, so the previous
        # ``getattr(connection_options, 'POST_LIMIT', ...)`` always returned
        # the default; ``.get()`` actually honours a configured POST_LIMIT.
        self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)

        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """Map Haystack field definitions onto a Whoosh Schema.

        Returns a ``(content_field_name, Schema)`` tuple; raises
        SearchBackendError when no user fields were found.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=int, field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=float, field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, at='start', stored=field_class.stored, field_boost=field_class.boost)
            else:
                # Plain text fields are analyzed with the ChineseAnalyzer.
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=ChineseAnalyzer(), field_boost=field_class.boost, sortable=True)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """Add/refresh the documents for ``iterable`` objects in the index."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            # Document boosts aren't supported in Whoosh 2.5.0+.
            if 'boost' in doc:
                del doc['boost']

            try:
                writer.update_document(**doc)
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                               exc_info=True,
                               extra={"data": {"index": index, "object": get_identifier(obj)}})

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

    def remove(self, obj_or_string, commit=True):
        """Delete the document identified by ``obj_or_string`` from the index."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            self.index.delete_by_query(q=self.parser.parse(u'%s:"%s"' % (ID, whoosh_id)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Whoosh: %s", whoosh_id, e)

    def clear(self, models=None, commit=True):
        """Remove all documents, or only those belonging to ``models``.

        BUG FIX: the default used to be a mutable ``models=[]``; ``None`` is
        equivalent here because the body only tests truthiness.
        """
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        try:
            if not models:
                self.delete_index()
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append(u"%s:%s" % (DJANGO_CT, get_model_ct(model)))

                self.index.delete_by_query(q=self.parser.parse(u" OR ".join(models_to_delete)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to clear documents from Whoosh: %s", e)

    def delete_index(self):
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage and os.path.exists(self.path):
            shutil.rmtree(self.path)
        elif not self.use_file_storage:
            self.storage.clean()

        # Recreate everything.
        self.setup()

    def optimize(self):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        self.index.optimize()

    def calculate_page(self, start_offset=0, end_offset=None):
        """Translate start/end offsets into Whoosh's 1-based (page, length)."""
        # Prevent against Whoosh throwing an error. Requires an end_offset
        # greater than 0.
        if end_offset is not None and end_offset <= 0:
            end_offset = 1

        # Determine the page.
        page_num = 0

        if end_offset is None:
            end_offset = 1000000

        if start_offset is None:
            start_offset = 0

        page_length = end_offset - start_offset

        if page_length and page_length > 0:
            page_num = int(start_offset / page_length)

        # Increment because Whoosh uses 1-based page numbers.
        page_num += 1

        return page_num, page_length

    @log_query
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None,
               query_facets=None, narrow_queries=None, spelling_query=None,
               within=None, dwithin=None, distance_point=None, models=None,
               limit_to_registered_models=None, result_class=None, **kwargs):
        """Run ``query_string`` against the index and return a results dict."""
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        query_string = force_text(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            sort_by = sort_by_list[0]

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)), limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            page_num, page_length = self.calculate_page(start_offset, end_offset)

            search_kwargs = {
                'pagelen': page_length,
                'sortedby': sort_by,
                'reverse': reverse,
            }

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(
                    parsed_query,
                    page_num,
                    **search_kwargs
                )
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            results = self._process_results(raw_page, highlight=highlight,
                                            query_string=query_string,
                                            spelling_query=spelling_query,
                                            result_class=result_class)
            searcher.close()

            if hasattr(narrow_searcher, 'close'):
                narrow_searcher.close()

            return results
        else:
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        """Find documents similar to ``model_instance``'s content field."""
        if not self.setup_complete:
            self.setup()

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != '*':
            narrow_queries.add(additional_query_string)

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)), limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        self.index = self.index.refresh()
        raw_results = EmptyResults()

        if self.index.doc_count():
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                raw_results = results[0].more_like_this(field_name, top=end_offset)

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None and hasattr(raw_results, 'filter'):
                raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        results = self._process_results(raw_page, result_class=result_class)

        # NOTE(review): if the index is empty, ``searcher`` was never bound
        # and this line would raise NameError — mirrors upstream behaviour.
        searcher.close()

        if hasattr(narrow_searcher, 'close'):
            narrow_searcher.close()

        return results

    def _process_results(self, raw_page, highlight=False, query_string='',
                         spelling_query=None, result_class=None):
        """Convert a Whoosh results page into Haystack's results dict."""
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            # BUG FIX: was ``len(value) is 0`` (identity
                            # comparison on an int literal).
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(',')
                        else:
                            additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter('em')
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name),
                        terms,
                        sa,
                        ContextFragmenter(),
                        formatter
                    )
                    additional_fields['highlighted'] = {
                        self.content_field_name: [whoosh_result],
                    }

                result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        """Suggest a corrected spelling for ``query_string`` (or None)."""
        spelling_suggestion = None
        reader = self.index.reader()
        corrector = reader.corrector(self.content_field_name)
        cleaned_query = force_text(query_string)

        if not query_string:
            return spelling_suggestion

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        for word in query_words:
            suggestions = corrector.suggest(word, limit=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])

        spelling_suggestion = ' '.join(suggested_words)
        return spelling_suggestion

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            if not hasattr(value, 'hour'):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = 'true'
            else:
                value = 'false'
        elif isinstance(value, (list, tuple)):
            value = u','.join([force_text(v) for v in value])
        elif isinstance(value, (six.integer_types, float)):
            # Leave it alone.
            pass
        else:
            value = force_text(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, six.string_types):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(date_values['year'], date_values['month'],
                                date_values['day'], date_values['hour'],
                                date_values['minute'], date_values['second'])

            try:
                # Attempt to use json to load the values.
                converted_value = json.loads(value)

                # Try to handle most built-in types.
                if isinstance(converted_value, (list, tuple, set, dict, six.integer_types, float, complex)):
                    return converted_value
            except Exception:
                # BUG FIX: was a bare ``except:``. If it fails (SyntaxError or
                # its ilk) or we don't trust it, continue on.
                pass

        return value
def build_index(sa_session, whoosh_index_dir, path_to_repositories, hgweb_config_dir):
    """
    Build the search indexes. One for repositories and another for tools within.

    When an index already exists on disk, the new indexes are built in
    temporary directories and swapped in at the end, so searches never see a
    half-built index.
    """
    #  Rare race condition exists here and below
    tool_index_dir = os.path.join(whoosh_index_dir, 'tools')
    if not os.path.exists(whoosh_index_dir):
        os.makedirs(whoosh_index_dir)
        os.makedirs(tool_index_dir)
        work_repo_dir = whoosh_index_dir
        work_tool_dir = tool_index_dir
    else:
        # Index exists, prevent in-place index regeneration
        work_repo_dir = tempfile.mkdtemp(prefix="tmp-whoosh-repo")
        work_tool_dir = tempfile.mkdtemp(prefix="tmp-whoosh-tool")

    repo_index_storage = FileStorage(work_repo_dir)
    tool_index_storage = FileStorage(work_tool_dir)
    repo_index = repo_index_storage.create_index(repo_schema)
    tool_index = tool_index_storage.create_index(tool_schema)
    repo_index_writer = repo_index.writer()
    tool_index_writer = tool_index.writer()
    repos_indexed = 0
    tools_indexed = 0

    for repo in get_repos(sa_session, path_to_repositories, hgweb_config_dir):
        repo_index_writer.add_document(
            id=repo.get('id'),
            name=unicodify(repo.get('name')),
            description=unicodify(repo.get('description')),
            long_description=unicodify(repo.get('long_description')),
            homepage_url=unicodify(repo.get('homepage_url')),
            remote_repository_url=unicodify(repo.get('remote_repository_url')),
            repo_owner_username=unicodify(repo.get('repo_owner_username')),
            categories=unicodify(repo.get('categories')),
            times_downloaded=repo.get('times_downloaded'),
            approved=repo.get('approved'),
            last_updated=repo.get('last_updated'),
            full_last_updated=repo.get('full_last_updated'),
            repo_lineage=unicodify(repo.get('repo_lineage')))

        #  Tools get their own index
        for tool in repo.get('tools_list'):
            tool_index_writer.add_document(
                id=unicodify(tool.get('id')),
                name=unicodify(tool.get('name')),
                version=unicodify(tool.get('version')),
                description=unicodify(tool.get('description')),
                help=unicodify(tool.get('help')),
                repo_owner_username=unicodify(repo.get('repo_owner_username')),
                repo_name=unicodify(repo.get('name')),
                repo_id=repo.get('id'))
            tools_indexed += 1
            print(tools_indexed, 'tools (', tool.get('id'), ')')

        repos_indexed += 1
        print(repos_indexed, 'repos (', repo.get('id'), ')')

    tool_index_writer.commit()
    repo_index_writer.commit()

    print("TOTAL repos indexed: ", repos_indexed)
    print("TOTAL tools indexed: ", tools_indexed)

    # Copy the built indexes if we were working in a tmp folder
    if work_repo_dir != whoosh_index_dir:
        shutil.rmtree(whoosh_index_dir)
        os.makedirs(whoosh_index_dir)
        os.makedirs(tool_index_dir)
        copy_tree(work_repo_dir, whoosh_index_dir)
        copy_tree(work_tool_dir, tool_index_dir)
        shutil.rmtree(work_repo_dir)
        # BUG FIX: the temporary tool-index directory used to be leaked —
        # only the repo temp dir was removed.
        shutil.rmtree(work_tool_dir)
def create_indexes(cls):
    """Create the "songs" Whoosh index inside ``index_dir``."""
    # The FileStorage handle is only needed for this one call, so it is
    # not kept in a local variable.
    FileStorage(index_dir).create_index(SongIndexSchema, indexname="songs")
# Route application logging to stderr and announce startup.
stream_handler = logging.StreamHandler()
app.logger.addHandler(stream_handler)
app.logger.setLevel(logging.INFO)
app.logger.info('microblog startup')

# Optionally bootstrap the Whoosh full-text index.
enable_search = WHOOSH_ENABLED
if enable_search:
    search_is_new = False
    if not os.path.exists(WHOOSH_BASE):
        os.mkdir(WHOOSH_BASE)
        search_is_new = True
    search_storage = FileStorage(WHOOSH_BASE)
    search_ix = None
    if search_is_new:
        # First run: build the schema and create a fresh index.
        schema = Schema(id=ID(stored=True), body=TEXT())
        search_ix = search_storage.create_index(schema)
    else:
        # Reuse the existing on-disk index.
        search_ix = search_storage.open_index()


class CustomJSONEncoder(JSONEncoder):
    """This class adds support for lazy translation texts to Flask's
    JSON encoder. This is necessary when flashing translated texts."""

    def default(self, obj):
        # Imported lazily so the dependency is only needed when encoding.
        from speaklater import is_lazy_string
        if is_lazy_string(obj):
            # ``unicode`` only exists on Python 2; NameError means Python 3.
            try:
                return unicode(obj)  # python 2
            except NameError:
                return str(obj)  # python 3
        return super(CustomJSONEncoder, self).default(obj)
def create_index(sender=None, **kwargs):
    """Signal handler: bootstrap the 'memopol' Whoosh index on first run."""
    # Only create the index when the directory is missing; an existing
    # directory is assumed to already hold a valid index (create_index()
    # would otherwise clobber it on every signal).
    if not os.path.exists(settings.WHOOSH_INDEX):
        os.mkdir(settings.WHOOSH_INDEX)
        storage = FileStorage(settings.WHOOSH_INDEX)
        storage.create_index(WHOOSH_SCHEMA, indexname='memopol')
class RedisWhooshStore(SAMLStoreBase):  # TODO: This needs a gc mechanism for keys (uuids)
    """SAML metadata store backed by Redis, mirrored into a Whoosh index.

    Entity XML and collection bookkeeping live in Redis-backed dicts
    (``objects`` / ``parts``); a Whoosh full-text index over selected entity
    attributes serves ``lookup`` and ``search``.
    """

    def json_dict(self, name):
        # Redis-backed JSON mapping wrapped in an LRU cache.
        return LRUProxyDict(JSONDict(key='{}_{}'.format(self._name, name),
                                     redis=self._redis,
                                     writeback=True),
                            maxsize=config.cache_size)

    def xml_dict(self, name):
        # Redis-backed XML mapping wrapped in an LRU cache.
        return LRUProxyDict(XMLDict(key='{}_{}'.format(self._name, name),
                                    redis=self._redis,
                                    writeback=True),
                            maxsize=config.cache_size)

    def __init__(self, *args, **kwargs):
        self._dir = kwargs.pop('directory', '.whoosh')
        clear = bool(kwargs.pop('clear', config.store_clear))
        self._name = kwargs.pop('name', config.store_name)
        self._redis = kwargs.pop('redis', redis())
        if clear:
            # Wipe the on-disk Whoosh data before _setup() recreates it.
            shutil.rmtree(self._dir)
        now = datetime.now()
        self._last_index_time = now
        self._last_modified = now
        self._setup()
        if clear:
            # Also drop the Redis-side keys for a truly clean start.
            self.reset()

    def _setup(self):
        """(Re)initialize schema, Redis dicts and the Whoosh index.

        Also called from __setstate__, so it must tolerate missing attributes.
        """
        self._redis = getattr(self, '_redis', None)
        if not self._redis:
            self._redis = redis()  # XXX test cases won't get correctly unpicked because of this
        self.schema = Schema(content=NGRAMWORDS(stored=False))
        self.schema.add("object_id", ID(stored=True, unique=True))
        self.schema.add("entity_id", ID(stored=True, unique=True))
        self.schema.add('sha1', ID(stored=True, unique=True))
        for a in list(ATTRS.keys()):
            self.schema.add(a, KEYWORD())
        self.objects = self.xml_dict('objects')
        self.parts = self.json_dict('parts')
        self.storage = FileStorage(os.path.join(self._dir, self._name))
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except BaseException as ex:
            # Opening failed (e.g. no index yet) — create one and rebuild it
            # from whatever is already in Redis.
            log.warn(ex)
            self.storage.create()
            self.index = self.storage.create_index(self.schema)
            self._reindex()

    def __getstate__(self):
        # Only pickle cheap scalar state; everything else is rebuilt by
        # _setup() in __setstate__.
        state = dict()
        for p in ('_dir', '_name', '_last_index_time', '_last_modified'):
            state[p] = getattr(self, p)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._setup()

    def __call__(self, *args, **kwargs):
        # Hook invoked by the framework; schedules periodic reindexing.
        watched = kwargs.pop('watched', None)
        scheduler = kwargs.pop('scheduler', None)
        if watched is not None and scheduler is not None:
            super(RedisWhooshStore, self).__call__(watched=watched, scheduler=scheduler)
            log.debug("indexing using {}".format(scheduler))
            if scheduler is not None:  # and self._last_modified > self._last_index_time and :
                scheduler.add_job(RedisWhooshStore._reindex,
                                  args=[self],
                                  max_instances=1,
                                  coalesce=True,
                                  misfire_grace_time=2 * config.update_frequency)

    def _reindex(self):
        """Rebuild the Whoosh index from Redis, dropping orphaned refs."""
        log.debug("indexing the store...")
        self._last_index_time = datetime.now()
        seen = set()
        refs = set([b2u(s) for s in self.objects.keys()])
        parts = self.parts.values()
        # A ref is "seen" if some collection still lists it among its items.
        for ref in refs:
            for part in parts:
                if ref in part['items']:
                    seen.add(ref)

        ix = self.storage.open_index()
        lock = ix.lock("reindex")
        try:
            log.debug("waiting for index lock")
            lock.acquire(True)
            log.debug("got index lock")
            with ix.writer() as writer:
                for ref in refs:
                    if ref not in seen:
                        log.debug("removing unseen ref {}".format(ref))
                        del self.objects[ref]
                        del self.parts[ref]
                log.debug("updating index")
                for e in self.objects.values():
                    info = self._index_prep(entity_simple_info(e))
                    ref = object_id(e)
                    writer.add_document(object_id=ref, **info)
                # CLEAR makes the commit replace the whole index.
                writer.mergetype = CLEAR
        finally:
            try:
                log.debug("releasing index lock")
                lock.release()
            except ThreadError as ex:
                pass

    def dump(self):
        """Debug helper: print every indexed document."""
        ix = self.storage.open_index()
        from whoosh.query import Every
        # NOTE(review): a second searcher is opened inside the ``with`` block
        # instead of using ``searcher`` — presumably unintended; confirm.
        with ix.searcher() as searcher:
            for result in ix.searcher().search(Every('object_id')):
                print(result)

    def _index_prep(self, info):
        """Flatten an entity-info dict into Whoosh document fields."""
        res = dict()
        if 'entity_attributes' in info:
            for a, v in list(info.pop('entity_attributes').items()):
                info[a] = v
        # Free-text blob for the NGRAMWORDS 'content' field.
        content = " ".join(
            filter(lambda x: x is not None,
                   [info.get(x, '') for x in ('service_name', 'title', 'domain',
                                              'keywords', 'scopes')]))
        res['content'] = content.strip()
        for a, v in info.items():
            k = a
            if a in ATTRS_INV:
                k = ATTRS_INV[a]
            if k in self.schema.names():
                if type(v) in (list, tuple):
                    res[k] = " ".join([vv.lower() for vv in v])
                elif type(v) in six.string_types:
                    res[k] = info[a].lower()
        res['sha1'] = hash_id(info['entity_id'], prefix=False)
        return res

    def update(self, t, tid=None, etag=None, lazy=True):
        """Store an EntityDescriptor or EntitiesDescriptor tree.

        Skips the write when the stored etag already matches; reindexes
        immediately unless ``lazy``.
        """
        relt = root(t)
        assert (relt is not None)
        if relt.tag == "{%s}EntityDescriptor" % NS['md']:
            # Single entity: keyed by its object id.
            ref = object_id(relt)
            parts = None
            if ref in self.parts:
                parts = self.parts[ref]
            if etag is not None and (parts is None or parts.get('etag', None) != etag):
                self.parts[ref] = {'id': relt.get('entityID'),
                                   'etag': etag,
                                   'count': 1,
                                   'items': [ref]}
                self.objects[ref] = relt
                self._last_modified = datetime.now()
        elif relt.tag == "{%s}EntitiesDescriptor" % NS['md']:
            # Collection of entities: keyed by tid (defaults to its Name).
            if tid is None:
                tid = relt.get('Name')
            if etag is None:
                etag = hex_digest(dumptree(t, pretty_print=False), 'sha256')
            parts = None
            if tid in self.parts:
                parts = self.parts[tid]
            if parts is None or parts.get('etag', None) != etag:
                items = set()
                for e in iter_entities(t):
                    ref = object_id(e)
                    items.add(ref)
                    self.objects[ref] = e
                self.parts[tid] = {'id': tid,
                                   'count': len(items),
                                   'etag': etag,
                                   'items': list(items)}
                self._last_modified = datetime.now()
        if not lazy:
            self._reindex()

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def collections(self):
        """Return the known collection ids."""
        return [b2u(ref) for ref in self.parts.keys()]

    def reset(self):
        """Delete this store's Redis keys."""
        # NOTE(review): the loop variable ``k`` is never used — both keys are
        # deleted on each of the two iterations. Net effect is the same, but
        # the body presumably meant ``self._redis.delete(k)``.
        for k in ('{}_{}'.format(self._name, 'parts'), '{}_{}'.format(self._name, 'objects')):
            self._redis.delete('{}_{}'.format(self._name, 'parts'))
            self._redis.delete('{}_{}'.format(self._name, 'objects'))

    def size(self, a=None, v=None):
        """Count objects, values of attribute ``a``, or matches of ``a=v``."""
        if a is None:
            return len(self.objects.keys())
        elif a is not None and v is None:
            return len(self.attribute(a))
        else:
            return len(self.lookup("{!s}={!s}".format(a, v)))

    def _attributes(self):
        # Yield the friendly names of indexed fields that map to known ATTRS.
        ix = self.storage.open_index()
        with ix.reader() as reader:
            for n in reader.indexed_field_names():
                if n in ATTRS:
                    yield b2u(ATTRS[n])

    def attributes(self):
        return b2u(list(self._attributes()))

    def attribute(self, a):
        """Return the distinct indexed values for attribute ``a``."""
        if a in ATTRS_INV:
            n = ATTRS_INV[a]
            ix = self.storage.open_index()
            with ix.searcher() as searcher:
                return b2u(list(searcher.lexicon(n)))
        else:
            return []

    def _prep_key(self, key):
        """Translate the external query syntax into Whoosh query syntax."""
        # import pdb; pdb.set_trace()
        key = key.strip('+')
        key = key.replace('+', ' AND ')
        key = key.replace('-', ' AND NOT ')
        # Replace attribute URIs with their short field names.
        for uri, a in list(ATTRS_INV.items()):
            key = key.replace(uri, a)
        key = " {!s} ".format(key)
        key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
        key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
        key = key.strip()
        return key

    def _entities(self):
        # All entities referenced by any collection.
        lst = set()
        for ref_data in self.parts.values():
            for ref in ref_data['items']:
                e = self.objects.get(ref, None)
                if e is not None:
                    lst.add(e)
        return b2u(list(lst))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def lookup(self, key):
        """Resolve ``key`` to entities: by id, by collection, or via Whoosh."""
        if key == 'entities' or key is None:
            return self._entities()

        bkey = six.b(key)
        if bkey in self.objects:
            return [self.objects.get(bkey)]

        if bkey in self.parts:
            # A collection: resolve each member recursively.
            res = []
            part = self.parts.get(bkey)
            for item in part['items']:
                res.extend(self.lookup(item))
            return res

        key = self._prep_key(key)
        qp = QueryParser("object_id", schema=self.schema)
        q = qp.parse(key)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            for result in results:
                e = self.objects.get(result['object_id'], None)
                if e is not None:
                    lst.add(e)
        return b2u(list(lst))

    @ttl_cache(ttl=config.cache_ttl, maxsize=config.cache_size)
    def search(self, query=None, path=None, entity_filter=None, related=None):
        """Free-text search over content/domain; returns discojson dicts."""
        if entity_filter:
            query = "{!s} AND {!s}".format(query, entity_filter)
        query = self._prep_key(query)
        qp = MultifieldParser(['content', 'domain'], schema=self.schema)
        q = qp.parse(query)
        lst = set()
        with self.index.searcher() as searcher:
            results = searcher.search(q, limit=None)
            log.debug(results)
            for result in results:
                lst.add(result['object_id'])
        res = list()
        for ref in lst:
            e = self.objects.get(ref, None)
            if e is not None:
                res.append(discojson(e))
        return res
class WhooshSearchBackend(BaseSearchBackend):
    """Haystack search backend storing its index in Whoosh (file or RAM storage)."""

    # Words reserved by Whoosh for special use.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        "\\", "+", "-", "&&", "||", "!", "(", ")", "{", "}", "[", "]",
        "^", '"', "~", "*", "?", ":", ".",
    )

    def __init__(self, connection_alias, **connection_options):
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # BUGFIX: connection_options is a dict, so getattr() always returned
        # the default — use .get() so a configured POST_LIMIT is honoured.
        self.post_limit = connection_options.get("POST_LIMIT", 128 * 1024 * 1024)
        self.path = connection_options.get("PATH")

        # Anything other than 'file' selects the in-process RAM storage.
        if connection_options.get("STORAGE", "file") != "file":
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'." % connection_alias
            )

        self.log = logging.getLogger("haystack")

    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections

        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group." % self.path
            )

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(
            connections[self.connection_alias].get_unified_index().all_searchfields()
        )
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """Map Haystack field objects onto a Whoosh Schema.

        Returns ``(content_field_name, Schema)`` where content_field_name is
        the document field used for default full-text queries.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ""

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True, commas=True, scorable=True, field_boost=field_class.boost
                    )
            elif field_class.field_type in ["date", "datetime"]:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == "integer":
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored, type=int, field_boost=field_class.boost
                )
            elif field_class.field_type == "float":
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored, type=float, field_boost=field_class.boost
                )
            elif field_class.field_type == "boolean":
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == "ngram":
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost
                )
            elif field_class.field_type == "edge_ngram":
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2, maxsize=15, at="start", stored=field_class.stored, field_boost=field_class.boost
                )
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost
                )

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """Index every object in *iterable*, preparing documents via *index*."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            # BUGFIX: 'except Exception, e' is Python 2-only syntax.
            except Exception as e:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                # BUGFIX: exception *instances* have no __name__; log the class name.
                self.log.error(
                    u"%s while preparing object for update" % e.__class__.__name__,
                    exc_info=True,
                    extra={"data": {"index": index, "object": get_identifier(obj)}},
                )

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if self.include_spelling is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
class SearchMigrationTest(TestCase): """Search index migration testing""" def setUp(self): self.path = tempfile.mkdtemp() self.backup = weblate.trans.search.STORAGE self.storage = FileStorage(self.path) weblate.trans.search.STORAGE = self.storage self.storage.create() def tearDown(self): if os.path.exists(self.path): shutil.rmtree(self.path) weblate.trans.search.STORAGE = self.backup def do_test(self, source, target): if source is not None: self.storage.create_index(source, 'source') if target is not None: self.storage.create_index(target, 'target-cs') self.assertIsNotNone( weblate.trans.search.get_source_index() ) self.assertIsNotNone( weblate.trans.search.get_target_index('cs') ) def test_nonexisting(self): self.do_test(None, None) def test_nonexisting_dir(self): shutil.rmtree(self.path) self.do_test(None, None) def test_current(self): source = weblate.trans.search.SourceSchema target = weblate.trans.search.TargetSchema self.do_test(source, target) def test_2_4(self): source = Schema( checksum=ID(stored=True, unique=True), source=TEXT(), context=TEXT(), location=TEXT() ) target = Schema( checksum=ID(stored=True, unique=True), target=TEXT(), comment=TEXT(), ) self.do_test(source, target) def test_2_1(self): source = Schema( checksum=ID(stored=True, unique=True), source=TEXT(), context=TEXT(), ) target = Schema( checksum=ID(stored=True, unique=True), target=TEXT(), ) self.do_test(source, target)
class SearchMigrationTest(TestCase, TempDirMixin): """Search index migration testing""" def setUp(self): self.create_temp() self.backup = weblate.trans.search.STORAGE self.storage = FileStorage(self.tempdir) weblate.trans.search.STORAGE = self.storage self.storage.create() def tearDown(self): self.remove_temp() weblate.trans.search.STORAGE = self.backup def do_test(self, source, target): if source is not None: self.storage.create_index(source, 'source') if target is not None: self.storage.create_index(target, 'target-cs') sindex = weblate.trans.search.get_source_index() self.assertIsNotNone(sindex) tindex = weblate.trans.search.get_target_index('cs') self.assertIsNotNone(tindex) writer = sindex.writer() writer.update_document( pk=1, source="source", context="context", location="location", ) writer.commit() writer = tindex.writer() writer.update_document( pk=1, target="target", comment="comment" ) writer.commit() for item in ('source', 'context', 'location', 'target'): self.assertEqual( fulltext_search(item, ['cs'], {item: True}), set([1]) ) def test_nonexisting(self): self.do_test(None, None) def test_nonexisting_dir(self): shutil.rmtree(self.tempdir) self.tempdir = None self.do_test(None, None) def test_current(self): source = weblate.trans.search.SourceSchema target = weblate.trans.search.TargetSchema self.do_test(source, target) def test_2_4(self): source = Schema( checksum=ID(stored=True, unique=True), source=TEXT(), context=TEXT(), location=TEXT() ) target = Schema( checksum=ID(stored=True, unique=True), target=TEXT(), comment=TEXT(), ) self.do_test(source, target) def test_2_1(self): source = Schema( checksum=ID(stored=True, unique=True), source=TEXT(), context=TEXT(), ) target = Schema( checksum=ID(stored=True, unique=True), target=TEXT(), ) self.do_test(source, target)
def create_indexes(cls): storage = FileStorage(index_dir) storage.create_index(SongIndexSchema, indexname="songs")
def create_index(): if not os.path.exists(settings.WHOOSH_INDEX): os.makedirs(settings.WHOOSH_INDEX) storage = FileStorage(settings.WHOOSH_INDEX) storage.create_index(schema=WHOOSH_SCHEMA, indexname="rarog")
class SearchMigrationTest(TestCase, TempDirMixin): """Search index migration testing""" def setUp(self): self.create_temp() self.backup = weblate.trans.search.STORAGE self.storage = FileStorage(self.tempdir) weblate.trans.search.STORAGE = self.storage self.storage.create() def tearDown(self): self.remove_temp() weblate.trans.search.STORAGE = self.backup def do_test(self, source, target): if source is not None: self.storage.create_index(source, 'source') if target is not None: self.storage.create_index(target, 'target-cs') sindex = weblate.trans.search.get_source_index() self.assertIsNotNone(sindex) tindex = weblate.trans.search.get_target_index('cs') self.assertIsNotNone(tindex) writer = sindex.writer() writer.update_document( pk=1, source="source", context="context", location="location", ) writer.commit() writer = tindex.writer() writer.update_document(pk=1, target="target", comment="comment") writer.commit() for item in ('source', 'context', 'location', 'target'): self.assertEqual(fulltext_search(item, ['cs'], {item: True}), set([1])) def test_nonexisting(self): self.do_test(None, None) def test_nonexisting_dir(self): shutil.rmtree(self.tempdir) self.tempdir = None self.do_test(None, None) def test_current(self): source = weblate.trans.search.SourceSchema target = weblate.trans.search.TargetSchema self.do_test(source, target) def test_2_4(self): source = Schema(checksum=ID(stored=True, unique=True), source=TEXT(), context=TEXT(), location=TEXT()) target = Schema( checksum=ID(stored=True, unique=True), target=TEXT(), comment=TEXT(), ) self.do_test(source, target) def test_2_1(self): source = Schema( checksum=ID(stored=True, unique=True), source=TEXT(), context=TEXT(), ) target = Schema( checksum=ID(stored=True, unique=True), target=TEXT(), ) self.do_test(source, target)
import whoosh, os from whoosh.filedb.filestore import FileStorage schema_commit = whoosh.fields.Schema(repository_id=whoosh.fields.ID(stored=True), commit_id=whoosh.fields.ID(stored=True), author=whoosh.fields.TEXT(stored=True), date=whoosh.fields.DATETIME, message=whoosh.fields.ID(stored=True)) indexdir = "indexdir" storage = FileStorage(indexdir) exists = whoosh.index.exists_in(indexdir) if exists: ix = storage.open_index(indexname="usages") else: if not os.path.exists(indexdir): os.mkdir(indexdir) ix = storage.create_index(schema_commit, indexname="usages")
class WhooshSearchBackend(BaseSearchBackend):
    """Haystack search backend storing its index in Whoosh (file or RAM storage)."""

    # Words reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, connection_alias, **connection_options):
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # BUGFIX: connection_options is a dict, so getattr() always returned
        # the default — use .get() so a configured POST_LIMIT is honoured.
        self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured("You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)

        # BUGFIX: update() logs via self.log, but no logger was ever assigned.
        # Local import keeps this fix self-contained within the class.
        import logging
        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(connections[self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """Map Haystack field objects onto a Whoosh Schema.

        Returns ``(content_field_name, Schema)``.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int, field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float, field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """Index every object in *iterable*, preparing documents via *index*."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            try:
                writer.update_document(**doc)
            # BUGFIX: 'except Exception, e' is Python 2-only syntax; 'as'
            # works on Python 2.6+ and Python 3.
            except Exception as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Whoosh: %s", e)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if self.include_spelling is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)
if not os.path.exists("indexdir"): os.mkdir("indexdir") ix = index.create_in("indexdir", schema) 打开一个已经存在某个目录的索引,使用index.open_dir() [python] view plain copy import whoosh.index as index ix = index.open_dir("indexdir") 这些是便利方法: [python] view plain copy from whoosh.filedb.filestore import FileStorage storage = FileStorage("indexdir") # Create an index ix = storage.create_index(schema) # Open an existing index storage.open_index() 你创建index对象时使用的schema对象是可序列化的,并且和index一起存储 你可以在同一个目录下面使用多个索引,用关键字参数indexname区分 [python] view plain copy # Using the convenience functions ix = index.create_in("indexdir", schema=schema, indexname="usages") ix = index.open_dir("indexdir", indexname="usages") # Using the Storage object ix = storage.create_index(schema, indexname="usages") ix = storage.open_index(indexname="usages") Clearing the index
class SearchBackend(BaseSearchBackend):
    """Whoosh backend for pre-2.0 Haystack.

    NOTE(review): Python 2-era code (u'' literals, ``basestring``, ``long``,
    integer division in paging) — keep on Python 2 or port deliberately.
    """

    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, site=None):
        super(SearchBackend, self).__init__(site)
        self.setup_complete = False
        self.use_file_storage = True
        self.post_limit = getattr(settings, 'HAYSTACK_WHOOSH_POST_LIMIT', 128 * 1024 * 1024)

        # Any storage setting other than 'file' selects the RAM store.
        if getattr(settings, 'HAYSTACK_WHOOSH_STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not hasattr(settings, 'HAYSTACK_WHOOSH_PATH'):
            raise ImproperlyConfigured('You must specify a HAYSTACK_WHOOSH_PATH in your settings.')

    def setup(self):
        """
        Defers loading until needed.
        """
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
            os.makedirs(settings.HAYSTACK_WHOOSH_PATH)
            new_index = True

        if self.use_file_storage and not os.access(settings.HAYSTACK_WHOOSH_PATH, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % settings.HAYSTACK_WHOOSH_PATH)

        if self.use_file_storage:
            self.storage = FileStorage(settings.HAYSTACK_WHOOSH_PATH)
        else:
            global LOCALS

            if LOCALS.RAM_STORE is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        # Map Haystack field objects onto a Whoosh Schema; returns
        # (content_field_name, Schema).
        schema_fields = {
            'id': ID(stored=True, unique=True),
            'django_ct': ID(stored=True),
            'django_id': ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float)
            elif field_class.field_type == 'boolean':
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer())

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        # Index every object in *iterable*, preparing documents via *index*.
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            doc = index.full_prepare(obj)

            # Really make sure it's unicode, because Whoosh won't have it any
            # other way.
            for key in doc:
                doc[key] = self._from_python(doc[key])

            writer.update_document(**doc)

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

            # If spelling support is desired, add to the dictionary.
            if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
                sp = SpellChecker(self.storage)
                sp.add_field(self.index, self.content_field_name)

    def remove(self, obj_or_string, commit=True):
        # Delete the single document whose id matches the object's identifier.
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)
        self.index.delete_by_query(q=self.parser.parse(u'id:"%s"' % whoosh_id))

    def clear(self, models=[], commit=True):
        # NOTE(review): mutable default argument `models=[]` — safe only
        # because it is never mutated here, but `models=None` would be safer.
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        if not models:
            self.delete_index()
        else:
            models_to_delete = []

            for model in models:
                models_to_delete.append(u"django_ct:%s.%s" % (model._meta.app_label, model._meta.module_name))

            self.index.delete_by_query(q=self.parser.parse(u" OR ".join(models_to_delete)))

    def delete_index(self):
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage and os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
            shutil.rmtree(settings.HAYSTACK_WHOOSH_PATH)
        elif not self.use_file_storage:
            self.storage.clean()

        # Recreate everything.
        self.setup()

    def optimize(self):
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        self.index.optimize()

    @log_query
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
               narrow_queries=None, spelling_query=None,
               limit_to_registered_models=None, **kwargs):
        # Run a query against the index, honouring sorting, narrowing and
        # paging; returns a dict with 'results', 'hits' and (optionally)
        # 'spelling_suggestion'.
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        query_string = force_unicode(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if len(sort_by) > 1 and reverse_counter > 1:
                raise SearchBackendError("Whoosh does not handle more than one field and any field being ordered in reverse.")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            sort_by = sort_by_list[0]

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if limit_to_registered_models:
            # Using narrow queries, limit the results to only models registered
            # with the current site.
            if narrow_queries is None:
                narrow_queries = set()

            registered_models = self.build_registered_models_list()

            if len(registered_models) > 0:
                narrow_queries.add('django_ct:(%s)' % ' OR '.join(registered_models))

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_unicode(nq)))

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            # Prevent against Whoosh throwing an error. Requires an end_offset
            # greater than 0.
            if not end_offset is None and end_offset <= 0:
                end_offset = 1

            raw_results = searcher.search(parsed_query, limit=end_offset, sortedby=sort_by, reverse=reverse)

            # Handle the case where the results have been narrowed.
            if narrowed_results:
                raw_results.filter(narrowed_results)

            # Determine the page.
            page_num = 0

            if end_offset is None:
                end_offset = 1000000

            if start_offset is None:
                start_offset = 0

            page_length = end_offset - start_offset

            if page_length and page_length > 0:
                # NOTE(review): Python 2 integer division — under Python 3
                # this would need // to stay an int.
                page_num = start_offset / page_length

            # Increment because Whoosh uses 1-based page numbers.
            page_num += 1

            try:
                raw_page = ResultsPage(raw_results, page_num, page_length)
            except ValueError:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            return self._process_results(raw_page, highlight=highlight, query_string=query_string, spelling_query=spelling_query)
        else:
            # Empty index: optionally still compute a spelling suggestion.
            if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None,
                       limit_to_registered_models=None, **kwargs):
        # Not supported by this backend.
        warnings.warn("Whoosh does not handle More Like This.", Warning, stacklevel=2)
        return {
            'results': [],
            'hits': 0,
        }

    def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None):
        # Convert a Whoosh ResultsPage into Haystack SearchResult objects,
        # decoding stored values back into Python types.
        from haystack import site
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        facets = {}
        spelling_suggestion = None
        indexed_models = site.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result['django_ct'].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = site.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if isinstance(index.fields[string_key], MultiValueField):
                            # NOTE(review): `len(value) is 0` relies on CPython
                            # small-int caching — should be `== 0`.
                            if value is None or len(value) is 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(',')
                        else:
                            additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del(additional_fields['django_ct'])
                del(additional_fields['django_id'])

                if highlight:
                    from whoosh import analysis
                    from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                    sa = analysis.StemmingAnalyzer()
                    terms = [term.replace('*', '') for term in query_string.split()]

                    additional_fields['highlighted'] = {
                        self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                    }

                result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
                results.append(result)
            else:
                # Stale document for an unregistered model — drop it from hits.
                hits -= 1

        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        # Strip reserved words/characters, then ask the spell checker for the
        # best correction of each remaining word.
        spelling_suggestion = None
        sp = SpellChecker(self.storage)
        cleaned_query = force_unicode(query_string)

        if not query_string:
            return spelling_suggestion

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        for word in query_words:
            suggestions = sp.suggest(word, number=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])

        spelling_suggestion = ' '.join(suggested_words)
        return spelling_suggestion

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            # Dates are widened to midnight datetimes so Whoosh stores them
            # uniformly.
            if not hasattr(value, 'hour'):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = True
            else:
                value = False
        elif isinstance(value, (list, tuple)):
            value = u','.join([force_unicode(v) for v in value])
        elif isinstance(value, (int, long, float)):
            # Leave it alone.
            pass
        else:
            value = force_unicode(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, basestring):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'], date_values['minute'], date_values['second'])

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)

            # Try to handle most built-in types.
            if isinstance(converted_value, (list, tuple, set, dict, int, float, long, complex)):
                return converted_value
        except:
            # If it fails (SyntaxError or its ilk) or we don't trust it,
            # continue on.
            pass

        return value
def setup_index(): storage = FileStorage(data_dir('memory')) storage.create() return storage.create_index(TMSchema())
class WhooshSearchBackend(BaseSearchBackend):
    """Haystack search backend implemented on top of Whoosh.

    Supports file-backed storage (default, requires a writable ``PATH``
    connection option) or a process-local RAM store (``STORAGE != 'file'``).
    """

    # Word reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, connection_alias, **connection_options):
        """Read connection options (``PATH``, ``STORAGE``, ``POST_LIMIT``).

        Raises ImproperlyConfigured if file storage is used without a PATH.
        """
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # BUGFIX: connection_options is a dict, so getattr() always returned
        # the default and any configured POST_LIMIT was silently ignored.
        self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)

        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Defers loading until needed.

        Creates/opens the Whoosh index, builds the schema from the unified
        index, and prepares the query parser. Sets ``setup_complete``.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            # RAM store is shared per-thread via the module-level LOCALS.
            if getattr(LOCALS, 'RAM_STORE', None) is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(
            connections[self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """Map Haystack field objects to a Whoosh ``Schema``.

        Returns a ``(content_field_name, Schema)`` tuple; raises
        SearchBackendError if no user fields are defined.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True, commas=True, scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored, numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored, numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3, maxsize=15, stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2, maxsize=15, at='start',
                    stored=field_class.stored, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True, analyzer=ChineseAnalyzer(),
                    field_boost=field_class.boost, sortable=True)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """Index (or re-index) every object in ``iterable`` via ``index``."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        # Count objects as we go instead of calling len(iterable), so
        # generator inputs work too (len() would raise TypeError on them).
        objects_seen = 0

        for obj in iterable:
            objects_seen += 1

            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            else:
                # Really make sure it's unicode, because Whoosh won't have it any
                # other way.
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                # Document boosts aren't supported in Whoosh 2.5.0+.
                if 'boost' in doc:
                    del doc['boost']

                try:
                    writer.update_document(**doc)
                except Exception as e:
                    if not self.silently_fail:
                        raise

                    # We'll log the object identifier but won't include the actual object
                    # to avoid the possibility of that generating encoding errors while
                    # processing the log message:
                    self.log.error(
                        u"%s while preparing object for update" % e.__class__.__name__,
                        exc_info=True,
                        extra={"data": {"index": index, "object": get_identifier(obj)}})

        if objects_seen > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

    def remove(self, obj_or_string, commit=True):
        """Delete the document for ``obj_or_string`` from the index."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            self.index.delete_by_query(q=self.parser.parse(u'%s:"%s"' % (ID, whoosh_id)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Whoosh: %s",
                           whoosh_id, e, exc_info=True)

    def clear(self, models=None, commit=True):
        """Remove everything (or only the given models) from the index."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        if models is not None:
            assert isinstance(models, (list, tuple))

        try:
            if models is None:
                # Wiping everything: nuking the files is far cheaper than
                # deleting documents one query at a time.
                self.delete_index()
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append(u"%s:%s" % (DJANGO_CT, get_model_ct(model)))

                self.index.delete_by_query(
                    q=self.parser.parse(u" OR ".join(models_to_delete)))
        except Exception as e:
            if not self.silently_fail:
                raise

            if models is not None:
                self.log.error(
                    "Failed to clear Whoosh index of models '%s': %s",
                    ','.join(models_to_delete), e, exc_info=True)
            else:
                self.log.error("Failed to clear Whoosh index: %s", e, exc_info=True)

    def delete_index(self):
        """Destroy the whole index and recreate it empty."""
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage and os.path.exists(self.path):
            shutil.rmtree(self.path)
        elif not self.use_file_storage:
            self.storage.clean()

        # Recreate everything.
        self.setup()

    def optimize(self):
        """Ask Whoosh to optimize (compact) the on-disk index."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        self.index.optimize()

    def calculate_page(self, start_offset=0, end_offset=None):
        """Translate Haystack's slice offsets into Whoosh's 1-based paging.

        Returns a ``(page_num, page_length)`` tuple.
        """
        # Prevent against Whoosh throwing an error. Requires an end_offset
        # greater than 0.
        if end_offset is not None and end_offset <= 0:
            end_offset = 1

        # Determine the page.
        page_num = 0

        if end_offset is None:
            end_offset = 1000000

        if start_offset is None:
            start_offset = 0

        page_length = end_offset - start_offset

        if page_length and page_length > 0:
            page_num = int(start_offset / page_length)

        # Increment because Whoosh uses 1-based page numbers.
        page_num += 1

        return page_num, page_length

    @log_query
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None,
               query_facets=None, narrow_queries=None, spelling_query=None,
               within=None, dwithin=None, distance_point=None, models=None,
               limit_to_registered_models=None, result_class=None, **kwargs):
        """Run ``query_string`` against the index and return a results dict.

        Faceting kwargs are accepted for API compatibility but unsupported
        by Whoosh (a warning is issued). Returns a dict with at least
        ``results`` and ``hits`` keys.
        """
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        query_string = force_text(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            sort_by = sort_by_list

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_text(nq)), limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            page_num, page_length = self.calculate_page(start_offset, end_offset)

            search_kwargs = {
                'pagelen': page_length,
                'sortedby': sort_by,
                'reverse': reverse,
            }

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(parsed_query, page_num, **search_kwargs)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            results = self._process_results(raw_page, highlight=highlight,
                                            query_string=query_string,
                                            spelling_query=spelling_query,
                                            result_class=result_class)
            searcher.close()

            if hasattr(narrow_searcher, 'close'):
                narrow_searcher.close()

            return results
        else:
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        """Find documents similar to ``model_instance``'s indexed document."""
        if not self.setup_complete:
            self.setup()

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            # NOTE: narrow_queries is initialized to set() above, so this
            # None-check is vestigial; kept for parity with search().
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != '*':
            narrow_queries.add(additional_query_string)

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_text(nq)), limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        self.index = self.index.refresh()
        raw_results = EmptyResults()

        searcher = None
        if self.index.doc_count():
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                raw_results = results[0].more_like_this(field_name, top=end_offset)

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None and hasattr(raw_results, 'filter'):
                raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        results = self._process_results(raw_page, result_class=result_class)

        if searcher:
            searcher.close()

        if hasattr(narrow_searcher, 'close'):
            narrow_searcher.close()

        return results

    def _process_results(self, raw_page, highlight=False, query_string='',
                         spelling_query=None, result_class=None):
        """Convert a Whoosh ``ResultsPage`` into Haystack's results dict."""
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if index.fields[string_key].is_multivalued:
                            # BUGFIX: was `len(value) is 0` — identity compare
                            # with an int literal; use equality.
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(',')
                        else:
                            additional_fields[string_key] = index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                # Strip Haystack's internal bookkeeping keys.
                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter('em')
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name),
                        terms, sa, ContextFragmenter(), formatter)
                    additional_fields['highlighted'] = {
                        self.content_field_name: [whoosh_result],
                    }

                result = result_class(app_label, model_name, raw_result[DJANGO_ID],
                                      score, **additional_fields)
                results.append(result)
            else:
                # Model no longer registered/indexed; drop the hit.
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        """Return a "did you mean" phrase built from per-word corrections.

        Strips Whoosh reserved words/characters, then asks the index reader's
        corrector for the single best suggestion per remaining word.
        """
        spelling_suggestion = None
        reader = self.index.reader()
        corrector = reader.corrector(self.content_field_name)
        cleaned_query = force_text(query_string)

        if not query_string:
            return spelling_suggestion

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        for word in query_words:
            suggestions = corrector.suggest(word, limit=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])

        spelling_suggestion = ' '.join(suggested_words)
        return spelling_suggestion

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            # Plain dates are widened to midnight datetimes.
            if not hasattr(value, 'hour'):
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = 'true'
            else:
                value = 'false'
        elif isinstance(value, (list, tuple)):
            # Multi-valued fields are serialized comma-joined.
            value = u','.join([force_text(v) for v in value])
        elif isinstance(value, (six.integer_types, float)):
            # Leave it alone.
            pass
        else:
            value = force_text(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, six.string_types):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(date_values['year'], date_values['month'],
                                date_values['day'], date_values['hour'],
                                date_values['minute'], date_values['second'])

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)

            # Try to handle most built-in types.
            if isinstance(
                    converted_value,
                (list, tuple, set, dict, six.integer_types, float, complex)):
                return converted_value
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. If json fails (SyntaxError or its
            # ilk) or we don't trust it, continue on.
            pass

        return value
def create_index(**kwargs): if not os.path.exists(SEARCH_INDEX): os.mkdir(SEARCH_INDEX) storage = FileStorage(SEARCH_INDEX) storage.create_index(SEARCH_SCHEMA)