import os

from whoosh import index
from whoosh.fields import Schema, ID, TEXT, NGRAM

# Directory where the index data is stored.
INDEX_DIR = "indexdir"

# Schema for the index: the post URL acts as the unique document ID and
# the body text is indexed as N-grams (substring-search friendly).
schema = Schema(
    post_url=ID(unique=True, stored=True),
    body=NGRAM(stored=True),
)


def get_or_create_index():
    """Return the Whoosh index, creating the directory and files on first use."""
    if not os.path.exists(INDEX_DIR):
        # First run: make the directory and build a fresh index in it.
        os.mkdir(INDEX_DIR)
        return index.create_in(INDEX_DIR, schema)
    # The index directory already exists: reuse the existing index files.
    return index.open_dir(INDEX_DIR)
def build_schema(self, fields):
    """Map Haystack field definitions onto a Whoosh ``Schema``.

    Returns a ``(content_field_name, Schema)`` tuple where
    ``content_field_name`` is the index fieldname of the document field
    (``""`` if none is marked ``document=True``).

    Raises ``SearchBackendError`` when ``fields`` contributes nothing
    beyond the three hard-coded Haystack bookkeeping fields.
    """

    def _whoosh_field(fc):
        # Translate one Haystack field into its Whoosh counterpart.
        # Order matters: multivalued wins over field_type.
        if fc.is_multivalued:
            if fc.indexed is False:
                return IDLIST(stored=True, field_boost=fc.boost)
            return KEYWORD(
                stored=True,
                commas=True,
                scorable=True,
                field_boost=fc.boost,
            )
        kind = fc.field_type
        if kind in ["date", "datetime"]:
            return DATETIME(stored=fc.stored, sortable=True)
        if kind == "integer":
            return NUMERIC(stored=fc.stored, numtype=int, field_boost=fc.boost)
        if kind == "float":
            return NUMERIC(stored=fc.stored, numtype=float, field_boost=fc.boost)
        if kind == "boolean":
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            return BOOLEAN(stored=fc.stored)
        if kind == "ngram":
            return NGRAM(
                minsize=3,
                maxsize=15,
                stored=fc.stored,
                field_boost=fc.boost,
            )
        if kind == "edge_ngram":
            return NGRAMWORDS(
                minsize=2,
                maxsize=15,
                at="start",
                stored=fc.stored,
                field_boost=fc.boost,
            )
        # Default: full-text field with stemming.
        return TEXT(
            stored=True,
            analyzer=StemmingAnalyzer(),
            field_boost=fc.boost,
            sortable=True,
        )

    # Bookkeeping fields that Haystack always requires.
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)

    content_field_name = ""
    for field_class in fields.values():
        schema_fields[field_class.index_fieldname] = _whoosh_field(field_class)
        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            # Enable spelling suggestions on the main content field.
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search."
        )

    return (content_field_name, Schema(**schema_fields))
class GroupSchema(SchemaClass):
    """Whoosh schema describing an indexed group document."""

    # Primary key of the group; unique per document and stored so the
    # original value can be read back from search hits.
    pk = ID(stored=True, unique=True)
    # Group name; full-text indexed, stored, with spelling suggestions.
    name = TEXT(stored=True, spelling=True)
    # Free-text content indexed as N-grams down to single characters
    # (minsize=1), with phrase searching enabled.
    content = NGRAM(minsize=1, phrase=True)
def build_schema(self, fields):
    """Map Haystack field definitions onto a Whoosh ``Schema``.

    Identical to the stock haystack Whoosh backend except that the default
    TEXT analyzer is swapped for a Chinese one (see the else-branch below).

    Returns a ``(content_field_name, Schema)`` tuple; raises
    ``SearchBackendError`` when ``fields`` contributes nothing beyond the
    three hard-coded Haystack bookkeeping fields.
    """
    # Bookkeeping fields that Haystack always requires.
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)

    content_field_name = ''
    for field_name, field_class in fields.items():
        # Multivalued fields win over field_type dispatch.
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(
                    stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            schema_fields[field_class.index_fieldname] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                minsize=2, maxsize=15, at='start',
                stored=field_class.stored,
                field_boost=field_class.boost)
        else:
            # The analyzer django-haystack configures for Whoosh by default
            # is an English one, which gives poor results for Chinese text,
            # so it is replaced here with the jieba Chinese analyzer.
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True, analyzer=ChineseAnalyzer(),
                field_boost=field_class.boost, sortable=True)

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            # Enable spelling suggestions on the main content field.
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search."
        )

    return (content_field_name, Schema(**schema_fields))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Register lines of a text file as documents in a Whoosh N-gram index."""

import os

from whoosh import index
from whoosh.fields import Schema, STORED, NGRAM
from whoosh.qparser import QueryParser
from whoosh.analysis import NgramAnalyzer

INDEX_DIR = "/tmp/indexdir"

# Each document is a single stored N-gram "title" field.
schema = Schema(title=NGRAM(stored=True))


def open_index(indexdir):
    """Open the index at *indexdir*, creating directory and index on first use."""
    if not os.path.exists(indexdir):
        os.mkdir(indexdir)
        index.create_in(indexdir, schema)
    return index.open_dir(indexdir)


def register(filename, indexdir=INDEX_DIR):
    """Index every line of *filename* as a separate document.

    BUG FIX: the original called ``line.strip().decode('utf-8')`` on a
    text-mode file object, which fails on Python 3 where ``str`` has no
    ``decode``.  The file is now opened with an explicit UTF-8 encoding
    and both the file and the index are closed deterministically.
    """
    ix = open_index(indexdir)
    try:
        writer = ix.writer()
        with open(filename, encoding='utf-8') as fh:
            for line in fh:
                writer.add_document(title=line.strip())
        # optimize=True merges segments into one for faster searches.
        writer.commit(optimize=True)
    finally:
        ix.close()
# BUG FIX: ``cPickle`` only exists on Python 2; fall back to the plain
# ``pickle`` module (C-accelerated on Python 3 anyway) so the module
# imports on both interpreter generations.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Layout of the Sphinx web-support build tree, relative to this file.
ROOT = os.path.dirname(os.path.abspath(__file__))
SRCDIR = os.path.join(ROOT, 'source')
BUILDDIR = os.path.join(ROOT, 'build', 'web')
INDEXDIR = os.path.join(BUILDDIR, "data", "db")
print("SRC:{0}, BUILD:{1}, INDEX:{2}".format(SRCDIR, BUILDDIR, INDEXDIR))

uri = os.environ.get('DATABASE_URL')  # DATABASE_URL is given
storage = SQLAlchemyStorage(uri)

# Configure the Whoosh search backend: unique path as ID, boosted title,
# and N-gram indexed body text.
whoosh = whooshsearch.WhooshSearch
whoosh.schema = Schema(path=ID(stored=True, unique=True),
                       title=TEXT(field_boost=2.0, stored=True),
                       text=NGRAM(stored=True))
search = whoosh(INDEXDIR)

support = WebSupport(srcdir=SRCDIR,
                     builddir=BUILDDIR,
                     search=search,
                     storage=storage)

#### flask part
from flask import Flask, render_template, abort, g, request, jsonify, url_for
from jinja2 import Environment, FileSystemLoader

app = Flask(__name__)
#app.debug = True
#
all_fields = ['info', 'value', 'comment', 'tags'] # If field is None, search in all if not fields: search_fields = all_fields elif isinstance(fields, list): for f in fields: if f not in all_fields: raise Exception('Invalid Fieldname') search_fields = fields else: search_fields = [fields] if not os.path.exists("indexdir"): os.mkdir("indexdir") ix = open_dir("indexdir") mparser = MultifieldParser(search_fields, schema=ix.schema, group=OrGroup) with ix.searcher() as searcher: q = mparser.parse(query) responses = searcher.search(q, limit=None) return Counter([r['eid'] for r in responses]) if __name__ == '__main__': from connector import SnapshotConnector connector = SnapshotConnector() schema = Schema(eid=ID(stored=True), info=NGRAM(minsize=5, phrase=True), value=KEYWORD(lowercase=True), comment=NGRAM(minsize=5, phrase=True), tags=KEYWORD(lowercase=True)) index_all(connector, schema)