class IndexConfig:
    """Build an index schema and expose index settings from a config dict.

    Keys read from the configuration dictionary:

    * ``schema`` -- ``{field_name: {'field_type': <name>, 'args': {...}}}``
    * ``field_types`` -- ``{name: {'class': <class name>, 'args': {...}}}``
    * ``analyzers`` -- ``{name: {'class': ..., 'args': {...}}}`` or
      ``{name: {'tokenizer': <name>, 'filters': [<name>, ...]}}``
    * ``tokenizers`` / ``filters`` -- ``{name: {'class': ..., 'args': {...}}}``
    * ``storage`` / ``writer`` -- optional settings served by the getters

    NOTE(review): ``Schema``, ``get_instance`` and ``deepcopy`` are imported
    outside this block. ``Schema`` looks like ``whoosh.fields.Schema`` and
    ``get_instance(class_name, **kwargs)`` presumably instantiates the named
    class -- confirm against the file's imports.
    """

    def __init__(self, config_dict):
        """Create the schema described by ``config_dict``.

        Raises:
            KeyError: a required configuration key is missing.
            ValueError: the schema does not have exactly one unique field.
        """
        self.__index_config_dict = config_dict
        self.__schema = Schema()

        for field_name, field_conf in self.__index_config_dict['schema'].items():
            field_type = self.__get_field_type(field_conf['field_type'])
            # Per-field overrides are optional, like every other 'args'
            # section; they are applied as attributes on the field type.
            for arg, value in (field_conf.get('args') or {}).items():
                setattr(field_type, arg, value)
            self.__schema.add(field_name, field_type, glob=False)

        if not self.__validate():
            raise ValueError('invalid schema')

    @staticmethod
    def __copy_args(spec):
        """Return a deep copy of ``spec['args']`` ({} when absent or null)."""
        # Deep copy so the caller's configuration dict is never mutated.
        return deepcopy(spec.get('args') or {})

    def __instantiate(self, spec):
        """Create the object described by ``spec`` ('class' + optional 'args')."""
        return get_instance(spec['class'], **self.__copy_args(spec))

    def __setting(self, default, *path):
        """Walk ``path`` through the config dict; return ``default`` if absent.

        A section that is missing (KeyError) or present but not a mapping,
        e.g. ``None`` from an empty YAML section (TypeError), yields
        ``default``.
        """
        node = self.__index_config_dict
        for key in path:
            try:
                node = node[key]
            except (KeyError, TypeError):
                return default
        return node

    def __get_filter(self, name):
        """Instantiate the filter configured under ``filters[name]``."""
        return self.__instantiate(self.__index_config_dict['filters'][name])

    def __get_tokenizer(self, name):
        """Instantiate the tokenizer configured under ``tokenizers[name]``."""
        return self.__instantiate(self.__index_config_dict['tokenizers'][name])

    def __get_analyzer(self, name):
        """Build the analyzer configured under ``analyzers[name]``.

        A ``'class'`` entry is instantiated directly. Otherwise a
        ``'tokenizer'`` entry is combined with each listed filter using the
        ``|`` operator. Returns ``None`` when neither key is present.
        """
        spec = self.__index_config_dict['analyzers'][name]
        if 'class' in spec:
            return self.__instantiate(spec)
        if 'tokenizer' not in spec:
            return None

        analyzer = self.__get_tokenizer(spec['tokenizer'])
        for filter_name in spec.get('filters') or ():
            analyzer = analyzer | self.__get_filter(filter_name)
        return analyzer

    def __get_field_type(self, name):
        """Instantiate the field type configured under ``field_types[name]``.

        ``'analyzer'`` and ``'tokenizer'`` arguments hold names; each is
        replaced by the built object, or by ``None`` when the name is falsy.
        """
        spec = self.__index_config_dict['field_types'][name]
        class_args = self.__copy_args(spec)

        if 'analyzer' in class_args:
            analyzer_name = class_args['analyzer']
            class_args['analyzer'] = (
                self.__get_analyzer(analyzer_name) if analyzer_name else None)
        if 'tokenizer' in class_args:
            tokenizer_name = class_args['tokenizer']
            class_args['tokenizer'] = (
                self.__get_tokenizer(tokenizer_name) if tokenizer_name else None)

        return get_instance(spec['class'], **class_args)

    def __get_unique_fields(self):
        """Return the names of schema fields whose ``unique`` flag is truthy."""
        return [name for name, field in self.__schema.items() if field.unique]

    def __validate(self):
        """Return True when the schema has exactly one unique field."""
        return len(self.__get_unique_fields()) == 1

    def get_schema(self):
        """Return the schema built from the configuration."""
        return self.__schema

    def get_doc_id_field(self):
        """Return the name of the first unique field (the document id)."""
        return self.__get_unique_fields()[0]

    def get_storage_type(self):
        """Return ``storage.type`` (default ``'file'``)."""
        return self.__setting('file', 'storage', 'type')

    def get_writer_processors(self):
        """Return ``writer.processors`` (default ``1``)."""
        return self.__setting(1, 'writer', 'processors')

    def get_writer_batch_size(self):
        """Return ``writer.batch_size`` (default ``100``)."""
        return self.__setting(100, 'writer', 'batch_size')

    def get_writer_multi_segment(self):
        """Return ``writer.multi_segment`` (default ``False``)."""
        return self.__setting(False, 'writer', 'multi_segment')

    def get_writer_auto_commit_period(self):
        """Return ``writer.auto_commit.period`` (default ``0``)."""
        return self.__setting(0, 'writer', 'auto_commit', 'period')

    def get_writer_auto_commit_limit(self):
        """Return ``writer.auto_commit.limit`` (default ``10``)."""
        return self.__setting(10, 'writer', 'auto_commit', 'limit')
print(results[0])  # <Hit {'title': 'hello'}>; one result per page, page 1

##################################################################
## 1. Create the schema
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
# stored=True means the field's value can be retrieved (returned with search results).
# All keyword arguments to the constructor are treated as fieldname = fieldtype pairs.
# The fieldtype can be an instantiated FieldType object, or a FieldType sub-class
# (in which case the Schema will instantiate it with the default constructor before adding it).
# For example: s = Schema(content=TEXT, title=TEXT(stored = True), tags=KEYWORD(stored = True))
# Search results usually only need the article title and path; the article body is
# something the user clicks through to read, so content is not declared with stored=True.

from whoosh import fields

# Print the supported field types (only the upper-case names among the first 10 entries of dir(fields)).
print([item for item in dir(fields)[:10] if item.isupper()])  # ['BOOLEAN', 'COLUMN', 'DATETIME', 'ID', 'IDLIST', 'KEYWORD']
print(len(schema.items()))  # 3
print(schema.items()[0])  # ('content', TEXT(format=Positions(boost=1.0), scorable=True, stored=False, unique=None))
print(schema.items()[1])  # ('path', ID(format=Existence(boost=1.0), scorable=None, stored=True, unique=False))
print(schema.items()[2])  # ('title', TEXT(format=Positions(boost=1.0), scorable=True, stored=True, unique=None))
print(schema.names())  # ['content', 'path', 'title']; Returns a list of the names of the fields in this schema.
print(schema.scorable_names())  # ['content', 'title']; Returns a list of the names of fields that store field lengths.
## search_page(query, pagenum, pagelen=10, **kwargs)
results = searcher.search_page(myquery, 2, 1)
print(results[0])  # <Hit {'title': 'world'}>; one result per page, page 2
results = searcher.search_page(myquery, 1, 1)
print(results[0])  # <Hit {'title': 'hello'}>; one result per page, page 1

##################################################################
## 1. Create the schema
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
# stored=True means the field's value can be retrieved (returned with search results).
# All keyword arguments to the constructor are treated as fieldname = fieldtype pairs.
# The fieldtype can be an instantiated FieldType object, or a FieldType sub-class
# (in which case the Schema will instantiate it with the default constructor before adding it).
# For example: s = Schema(content=TEXT, title=TEXT(stored = True), tags=KEYWORD(stored = True))
# Search results usually only need the article title and path; the article body is
# something the user clicks through to read, so content is not declared with stored=True.

from whoosh import fields

# Print the supported field types (only the upper-case names among the first 10 entries of dir(fields)).
print([item for item in dir(fields)[:10] if item.isupper()])  # ['BOOLEAN', 'COLUMN', 'DATETIME', 'ID', 'IDLIST', 'KEYWORD']
print(len(schema.items()))  # 3
print(schema.items()[0])  # ('content', TEXT(format=Positions(boost=1.0), scorable=True, stored=False, unique=None))
print(schema.items()[1])  # ('path', ID(format=Existence(boost=1.0), scorable=None, stored=True, unique=False))
print(schema.items()[2])  # ('title', TEXT(format=Positions(boost=1.0), scorable=True, stored=True, unique=None))
print(schema.names())  # ['content', 'path', 'title']; Returns a list of the names of the fields in this schema.
print(schema.scorable_names())  # ['content', 'title']; Returns a list of the names of fields that store field lengths.
print(schema.stored_names())  # ['path', 'title']; Returns a list of the names of fields that are stored.
print(schema.has_scorable_fields())  # True

##################################################################
## 2. Build the index
## create_in(dirname, schema, indexname=None)
##     Convenience function to create an index in a directory.
##     Takes care of creating a FileStorage object for you.
ix = create_in('./tmp', schema)  # stores the schema info under ./tmp/; ** run this only once, otherwise it raises LockError **
print(type(ix))  # <class 'whoosh.index.FileIndex'>
print(ix.schema)  # <Schema: ['content', 'path', 'title']>
## writer(procs=1, **kwargs): Returns an IndexWriter object for this index.