コード例 #1
0
class IndexConfig:
    """Build a schema and expose index settings from a configuration mapping.

    The mapping is read with the following top-level keys:

    * ``schema``: ``{field_name: {'field_type': <name>, 'args': {...}}}``;
      ``args`` is optional and its entries are set as attributes on the
      created field type.
    * ``field_types`` / ``analyzers`` / ``tokenizers`` / ``filters``: named
      definitions of the form ``{'class': ..., 'args': {...}}`` (``args``
      optional).  An analyzer may instead name a ``tokenizer`` and an
      optional list of ``filters``.
    * ``storage`` and ``writer``: optional settings, see the getters.

    ``Schema``, ``get_instance`` and ``deepcopy`` come from the enclosing
    module.  NOTE(review): ``get_instance`` is not visible here; it is
    presumably a factory that resolves ``class`` to a callable and invokes
    it with the keyword arguments -- confirm.

    Raises:
        ValueError: if the resulting schema does not contain exactly one
            field whose ``unique`` attribute is truthy.
        KeyError: if a required configuration key is missing.
    """

    def __init__(self, config_dict):
        self.__index_config_dict = config_dict
        self.__schema = Schema()

        schema_conf = self.__index_config_dict['schema']
        for field_name, field_conf in schema_conf.items():
            field_type = self.__get_field_type(field_conf['field_type'])
            # Per-field overrides are applied on top of the shared field
            # type definition; 'args' is optional, as in the other sections.
            for arg, value in (field_conf.get('args') or {}).items():
                setattr(field_type, arg, value)
            self.__schema.add(field_name, field_type, glob=False)

        if not self.__validate():
            raise ValueError('invalid schema')

    @staticmethod
    def __class_args(conf):
        """Return a deep copy of ``conf['args']`` ({} if absent or empty).

        The copy keeps later mutation (e.g. replacing an analyzer name by
        its instance) from leaking back into the configuration mapping.
        """
        return deepcopy(conf.get('args') or {})

    def __get_setting(self, path, default):
        """Walk ``path`` through the config; return ``default`` if absent.

        A section that is present but ``None`` is treated like a missing
        section.  A leaf value that is explicitly ``None`` is returned
        unchanged.
        """
        node = self.__index_config_dict
        for key in path:
            if node is None:
                return default
            try:
                node = node[key]
            except KeyError:
                return default
        return node

    def __get_filter(self, name):
        """Create the filter registered under ``name``."""
        conf = self.__index_config_dict['filters'][name]
        return get_instance(conf['class'], **self.__class_args(conf))

    def __get_tokenizer(self, name):
        """Create the tokenizer registered under ``name``."""
        conf = self.__index_config_dict['tokenizers'][name]
        return get_instance(conf['class'], **self.__class_args(conf))

    def __get_analyzer(self, name):
        """Create the analyzer registered under ``name``.

        An entry with a ``class`` key is instantiated directly.  Otherwise
        an entry with a ``tokenizer`` key is composed by piping the
        tokenizer through each listed filter with ``|``.  Returns ``None``
        when the entry has neither key.
        """
        conf = self.__index_config_dict['analyzers'][name]

        if 'class' in conf:
            return get_instance(conf['class'], **self.__class_args(conf))
        if 'tokenizer' not in conf:
            return None

        analyzer = self.__get_tokenizer(conf['tokenizer'])
        for filter_name in conf.get('filters') or ():
            analyzer = analyzer | self.__get_filter(filter_name)
        return analyzer

    def __get_field_type(self, name):
        """Create the field type registered under ``name``.

        ``analyzer`` and ``tokenizer`` arguments hold names of configured
        components; they are replaced by the built instances (or ``None``
        when the configured name is empty) before instantiation.
        """
        conf = self.__index_config_dict['field_types'][name]
        class_args = self.__class_args(conf)

        if 'analyzer' in class_args:
            analyzer_name = class_args['analyzer']
            class_args['analyzer'] = (
                self.__get_analyzer(analyzer_name) if analyzer_name else None)
        if 'tokenizer' in class_args:
            tokenizer_name = class_args['tokenizer']
            class_args['tokenizer'] = (
                self.__get_tokenizer(tokenizer_name)
                if tokenizer_name else None)

        return get_instance(conf['class'], **class_args)

    def __get_unique_fields(self):
        """Return the names of schema fields whose ``unique`` is truthy."""
        return [name for name, field in self.__schema.items() if field.unique]

    def __validate(self):
        """Return True when the schema has exactly one unique field."""
        return len(self.__get_unique_fields()) == 1

    def get_schema(self):
        """Return the schema built from the configuration."""
        return self.__schema

    def get_doc_id_field(self):
        """Return the name of the first (validated: only) unique field."""
        return self.__get_unique_fields()[0]

    def get_storage_type(self):
        """Return ``storage.type``, defaulting to ``'file'``."""
        return self.__get_setting(('storage', 'type'), 'file')

    def get_writer_processors(self):
        """Return ``writer.processors``, defaulting to ``1``."""
        return self.__get_setting(('writer', 'processors'), 1)

    def get_writer_batch_size(self):
        """Return ``writer.batch_size``, defaulting to ``100``."""
        return self.__get_setting(('writer', 'batch_size'), 100)

    def get_writer_multi_segment(self):
        """Return ``writer.multi_segment``, defaulting to ``False``."""
        return self.__get_setting(('writer', 'multi_segment'), False)

    def get_writer_auto_commit_period(self):
        """Return ``writer.auto_commit.period``, defaulting to ``0``."""
        return self.__get_setting(('writer', 'auto_commit', 'period'), 0)

    def get_writer_auto_commit_limit(self):
        """Return ``writer.auto_commit.limit``, defaulting to ``10``."""
        return self.__get_setting(('writer', 'auto_commit', 'limit'), 10)
コード例 #2
0
ファイル: l1_hello-world.py プロジェクト: HCShi/jShellscript
# Tutorial fragment: defines a three-field schema and prints what it contains.
# NOTE(review): `results`, `Schema`, `TEXT` and `ID` are defined outside this
# excerpt. The trailing comment on each print records the author's output.
print(results[0])  # <Hit {'title': 'hello'}>; one result per page, page 1

##################################################################
## 1. Create the schema
schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                content=TEXT)  # stored=True means the value can be retrieved from search results
# All keyword arguments to the constructor are treated as fieldname = fieldtype pairs.
# The fieldtype can be an instantiated FieldType object, or a FieldType sub-class
#     (in which case the Schema will instantiate it with the default constructor before adding it).
# For example: s = Schema(content=TEXT, title=TEXT(stored = True), tags=KEYWORD(stored = True))
# Search results usually only need the article title and path; the content is read by clicking through, so content is not stored=True
from whoosh import fields
# Print the supported field types
print([item for item in dir(fields)[:10] if item.isupper()
       ])  # ['BOOLEAN', 'COLUMN', 'DATETIME', 'ID', 'IDLIST', 'KEYWORD']
print(len(schema.items()))  # 3
print(
    schema.items()[0]
)  # ('content', TEXT(format=Positions(boost=1.0), scorable=True, stored=False, unique=None))
print(
    schema.items()[1]
)  # ('path', ID(format=Existence(boost=1.0), scorable=None, stored=True, unique=False))
print(
    schema.items()[2]
)  # ('title', TEXT(format=Positions(boost=1.0), scorable=True, stored=True, unique=None))
print(
    schema.names()
)  # ['content', 'path', 'title']; Returns a list of the names of the fields in this schema.
print(
    schema.scorable_names()
)  # ['content', 'title']; Returns a list of the names of fields that store field lengths.
コード例 #3
0
# Tutorial fragment: pages through search results, defines a three-field
# schema, prints what it contains, then creates an index in ./tmp.
# NOTE(review): `searcher`, `myquery`, `Schema`, `TEXT`, `ID` and `create_in`
# are defined outside this excerpt. The trailing comment on each statement
# records the author's output.
## search_page(query, pagenum, pagelen=10, **kwargs)
results = searcher.search_page(myquery, 2, 1); print(results[0])  # <Hit {'title': 'world'}>; one result per page, page 2
results = searcher.search_page(myquery, 1, 1); print(results[0])  # <Hit {'title': 'hello'}>; one result per page, page 1

##################################################################
## 1. Create the schema
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)  # stored=True means the value can be retrieved from search results
# All keyword arguments to the constructor are treated as fieldname = fieldtype pairs.
# The fieldtype can be an instantiated FieldType object, or a FieldType sub-class
#     (in which case the Schema will instantiate it with the default constructor before adding it).
# For example: s = Schema(content=TEXT, title=TEXT(stored = True), tags=KEYWORD(stored = True))
# Search results usually only need the article title and path; the content is read by clicking through, so content is not stored=True
from whoosh import fields
# Print the supported field types
print([item for item in dir(fields)[:10] if item.isupper()])  # ['BOOLEAN', 'COLUMN', 'DATETIME', 'ID', 'IDLIST', 'KEYWORD']
print(len(schema.items()))  # 3
print(schema.items()[0])  # ('content', TEXT(format=Positions(boost=1.0), scorable=True, stored=False, unique=None))
print(schema.items()[1])  # ('path', ID(format=Existence(boost=1.0), scorable=None, stored=True, unique=False))
print(schema.items()[2])  # ('title', TEXT(format=Positions(boost=1.0), scorable=True, stored=True, unique=None))
print(schema.names())  # ['content', 'path', 'title']; Returns a list of the names of the fields in this schema.
print(schema.scorable_names())  # ['content', 'title']; Returns a list of the names of fields that store field lengths.
print(schema.stored_names())  # ['path', 'title']; Returns a list of the names of fields that are stored.
print(schema.has_scorable_fields())  # True
##################################################################
## 2. Build the index
## create_in(dirname, schema, indexname=None)
## Convenience function to create an index in a directory. Takes care of creating a FileStorage object for you.
ix = create_in('./tmp', schema)  # Stores the schema information in ./tmp/; ** author's note: run this only once, otherwise a LockError is raised **
print(type(ix))  # <class 'whoosh.index.FileIndex'>
print(ix.schema)  # <Schema: ['content', 'path', 'title']>
## writer(procs=1, **kwargs): Returns an IndexWriter object for this index.