import os

from whoosh import index
from whoosh.fields import Schema, ID, TEXT, NGRAM

# Directory where the index data is stored.
INDEX_DIR = "indexdir"

# Schema for the index: the post URL acts as the unique document ID and
# the body text is indexed as N-grams (substring-search friendly).
schema = Schema(
    post_url=ID(unique=True, stored=True),
    body=NGRAM(stored=True),
)


def get_or_create_index():
    """Return the Whoosh index, creating the directory and files on first use."""
    if not os.path.exists(INDEX_DIR):
        # First run: make the directory and build a fresh index in it.
        os.mkdir(INDEX_DIR)
        return index.create_in(INDEX_DIR, schema)
    # The index directory already exists: reuse the existing index files.
    return index.open_dir(INDEX_DIR)
def build_schema(self, fields):
    """Map Haystack field definitions onto a Whoosh ``Schema``.

    Returns a ``(content_field_name, Schema)`` tuple where
    ``content_field_name`` is the index fieldname of the document field
    (``""`` if none is marked ``document=True``).

    Raises ``SearchBackendError`` when ``fields`` contributes nothing
    beyond the three hard-coded Haystack bookkeeping fields.
    """

    def _whoosh_field(fc):
        # Translate one Haystack field into its Whoosh counterpart.
        # Order matters: multivalued wins over field_type.
        if fc.is_multivalued:
            if fc.indexed is False:
                return IDLIST(stored=True, field_boost=fc.boost)
            return KEYWORD(
                stored=True,
                commas=True,
                scorable=True,
                field_boost=fc.boost,
            )
        kind = fc.field_type
        if kind in ["date", "datetime"]:
            return DATETIME(stored=fc.stored, sortable=True)
        if kind == "integer":
            return NUMERIC(stored=fc.stored, numtype=int, field_boost=fc.boost)
        if kind == "float":
            return NUMERIC(stored=fc.stored, numtype=float, field_boost=fc.boost)
        if kind == "boolean":
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            return BOOLEAN(stored=fc.stored)
        if kind == "ngram":
            return NGRAM(
                minsize=3,
                maxsize=15,
                stored=fc.stored,
                field_boost=fc.boost,
            )
        if kind == "edge_ngram":
            return NGRAMWORDS(
                minsize=2,
                maxsize=15,
                at="start",
                stored=fc.stored,
                field_boost=fc.boost,
            )
        # Default: full-text field with stemming.
        return TEXT(
            stored=True,
            analyzer=StemmingAnalyzer(),
            field_boost=fc.boost,
            sortable=True,
        )

    # Bookkeeping fields that Haystack always requires.
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)

    content_field_name = ""
    for field_class in fields.values():
        schema_fields[field_class.index_fieldname] = _whoosh_field(field_class)
        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            # Enable spelling suggestions on the main content field.
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search."
        )

    return (content_field_name, Schema(**schema_fields))
class GroupSchema(SchemaClass):
    """Whoosh schema describing an indexed group document."""

    # Primary key of the group; unique per document and stored so the
    # original value can be read back from search hits.
    pk = ID(stored=True, unique=True)
    # Group name; full-text indexed, stored, with spelling suggestions.
    name = TEXT(stored=True, spelling=True)
    # Free-text content indexed as N-grams down to single characters
    # (minsize=1), with phrase searching enabled.
    content = NGRAM(minsize=1, phrase=True)
def build_schema(self, fields):
    """Map Haystack field definitions onto a Whoosh ``Schema``.

    Identical to the stock haystack Whoosh backend except that the default
    TEXT analyzer is swapped for a Chinese one (see the else-branch below).

    Returns a ``(content_field_name, Schema)`` tuple; raises
    ``SearchBackendError`` when ``fields`` contributes nothing beyond the
    three hard-coded Haystack bookkeeping fields.
    """
    # Bookkeeping fields that Haystack always requires.
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)

    content_field_name = ''
    for field_name, field_class in fields.items():
        # Multivalued fields win over field_type dispatch.
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(
                    stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            schema_fields[field_class.index_fieldname] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                minsize=2, maxsize=15, at='start',
                stored=field_class.stored,
                field_boost=field_class.boost)
        else:
            # The analyzer django-haystack configures for Whoosh by default
            # is an English one, which gives poor results for Chinese text,
            # so it is replaced here with the jieba Chinese analyzer.
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True, analyzer=ChineseAnalyzer(),
                field_boost=field_class.boost, sortable=True)

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            # Enable spelling suggestions on the main content field.
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search."
        )

    return (content_field_name, Schema(**schema_fields))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Register lines of a text file as documents in a Whoosh N-gram index."""

import os

from whoosh import index
from whoosh.fields import Schema, STORED, NGRAM
from whoosh.qparser import QueryParser
from whoosh.analysis import NgramAnalyzer

INDEX_DIR = "/tmp/indexdir"

# Each document is a single stored N-gram "title" field.
schema = Schema(title=NGRAM(stored=True))


def open_index(indexdir):
    """Open the index at *indexdir*, creating directory and index on first use."""
    if not os.path.exists(indexdir):
        os.mkdir(indexdir)
        index.create_in(indexdir, schema)
    return index.open_dir(indexdir)


def register(filename, indexdir=INDEX_DIR):
    """Index every line of *filename* as a separate document.

    BUG FIX: the original called ``line.strip().decode('utf-8')`` on a
    text-mode file object, which fails on Python 3 where ``str`` has no
    ``decode``.  The file is now opened with an explicit UTF-8 encoding
    and both the file and the index are closed deterministically.
    """
    ix = open_index(indexdir)
    try:
        writer = ix.writer()
        with open(filename, encoding='utf-8') as fh:
            for line in fh:
                writer.add_document(title=line.strip())
        # optimize=True merges segments into one for faster searches.
        writer.commit(optimize=True)
    finally:
        ix.close()
# BUG FIX: ``cPickle`` only exists on Python 2; fall back to the plain
# ``pickle`` module (C-accelerated on Python 3 anyway) so the module
# imports on both interpreter generations.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Layout of the Sphinx web-support build tree, relative to this file.
ROOT = os.path.dirname(os.path.abspath(__file__))
SRCDIR = os.path.join(ROOT, 'source')
BUILDDIR = os.path.join(ROOT, 'build', 'web')
INDEXDIR = os.path.join(BUILDDIR, "data", "db")
print("SRC:{0}, BUILD:{1}, INDEX:{2}".format(SRCDIR, BUILDDIR, INDEXDIR))

uri = os.environ.get('DATABASE_URL')  # DATABASE_URL is given
storage = SQLAlchemyStorage(uri)

# Configure the Whoosh search backend: unique path as ID, boosted title,
# and N-gram indexed body text.
whoosh = whooshsearch.WhooshSearch
whoosh.schema = Schema(path=ID(stored=True, unique=True),
                       title=TEXT(field_boost=2.0, stored=True),
                       text=NGRAM(stored=True))
search = whoosh(INDEXDIR)

support = WebSupport(srcdir=SRCDIR,
                     builddir=BUILDDIR,
                     search=search,
                     storage=storage)

#### flask part
from flask import Flask, render_template, abort, g, request, jsonify, url_for
from jinja2 import Environment, FileSystemLoader

app = Flask(__name__)
#app.debug = True
#
all_fields = ['info', 'value', 'comment', 'tags'] # If field is None, search in all if not fields: search_fields = all_fields elif isinstance(fields, list): for f in fields: if f not in all_fields: raise Exception('Invalid Fieldname') search_fields = fields else: search_fields = [fields] if not os.path.exists("indexdir"): os.mkdir("indexdir") ix = open_dir("indexdir") mparser = MultifieldParser(search_fields, schema=ix.schema, group=OrGroup) with ix.searcher() as searcher: q = mparser.parse(query) responses = searcher.search(q, limit=None) return Counter([r['eid'] for r in responses]) if __name__ == '__main__': from connector import SnapshotConnector connector = SnapshotConnector() schema = Schema(eid=ID(stored=True), info=NGRAM(minsize=5, phrase=True), value=KEYWORD(lowercase=True), comment=NGRAM(minsize=5, phrase=True), tags=KEYWORD(lowercase=True)) index_all(connector, schema)