# Note: the whoosh imports below are standard; CustomFilter is a project-specific
# filter defined elsewhere in the source project.
from whoosh.analysis import StandardAnalyzer, StemFilter
from whoosh.fields import SchemaClass, TEXT, KEYWORD, NUMERIC


class RecipeSchema(SchemaClass):
    title = TEXT(stored=True, multitoken_query='or',
                 analyzer=StandardAnalyzer() | StemFilter() | CustomFilter())
    directions = TEXT(stored=True, multitoken_query='or',
                      analyzer=StandardAnalyzer() | StemFilter() | CustomFilter())
    ingredients = TEXT(stored=True, multitoken_query='or',
                       analyzer=StandardAnalyzer() | StemFilter() | CustomFilter())
    categories = KEYWORD(stored=True, commas=True)
    calories = NUMERIC(stored=True)

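# A quick, self-contained sketch (not from the original project) of what the stemmed
# analyzer chain used above does to field text. CustomFilter is omitted because it is
# project-specific, so this shows only the StandardAnalyzer | StemFilter part.
demo_analyzer = StandardAnalyzer() | StemFilter()
# Stop words are dropped, remaining tokens are lowercased and stemmed,
# e.g. "Baking" and "baked" both index as "bake".
print([t.text for t in demo_analyzer(u"Baking the baked tomatoes")])
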
def GensimAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
    return (GensimTokenizer()
            | LowercaseFilter()
            | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize)
            | StopFilter(stoplist=stoplist, minsize=minsize))

def ChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
    return (ChineseTokenizer()
            | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize)
            | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize))

def create_analyzer():
    # Build the analyzer chain incrementally instead of enumerating every combination
    # of flags; the filters are always applied in the same order
    # (stop words -> character folding -> stemming -> q-grams).
    conf = config.get_config()
    analyzer = RegexTokenizer()
    if conf['STOPWORDS']:
        analyzer = analyzer | StopFilter()
    if conf['CHARACTERS_FOLDING']:
        analyzer = analyzer | CharsetFilter(accent_map)
    if conf['STEMMING']:
        analyzer = analyzer | StemFilter()
    if conf['QGRAMS']:
        analyzer = analyzer | NgramFilter(minsize=conf['QNUM_MIN'],
                                          maxsize=conf['QNUM_MAX'])
    log.print_debug(TAG, "Analyzer created")
    return analyzer

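# Sketch of the chain create_analyzer() builds when every flag is enabled; the flag
# values and the sample text are made up for illustration, only the whoosh imports
# are real.
from whoosh.analysis import (RegexTokenizer, StopFilter, CharsetFilter,
                             StemFilter, NgramFilter)
from whoosh.support.charset import accent_map

demo = (RegexTokenizer() | StopFilter() | CharsetFilter(accent_map)
        | StemFilter() | NgramFilter(minsize=3, maxsize=4))
# Accents are folded ("café" -> "cafe"), tokens are stemmed, then split into
# 3-4 character q-grams.
print([t.text for t in demo(u"Searching the café menus")])
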
def CleanupStemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                            minsize=2, maxsize=None, gaps=False, stemfn=stem,
                            ignore=None, cachesize=50000):
    ret = RegexTokenizer(expression=expression, gaps=gaps)
    # added CleanupFilter here
    chain = ret | CleanupFilter() | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize,
                                   maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore, cachesize=cachesize)

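# CleanupFilter above is project-specific. For reference, a minimal custom whoosh
# filter is just a Filter subclass whose __call__ rewrites or drops tokens; the class
# below is an illustrative stand-in, not the original project's CleanupFilter.
from whoosh.analysis import Filter


class StripPunctuationFilter(Filter):
    """Example filter: strip surrounding punctuation and drop emptied tokens."""

    def __call__(self, tokens):
        for t in tokens:
            t.text = t.text.strip(".,;:!?\"'()")
            if t.text:
                yield t
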
try:
    from whoosh.index import _CURRENT_TOC_VERSION as whoosh_ix_ver
except ImportError:
    from whoosh.filedb.fileindex import _INDEX_VERSION as whoosh_ix_ver

from stemming import stemArabic


def stemfn(word):
    return stemArabic(stem(word))

# word_re = ur"[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]"
# Note: the ur"..." raw-unicode literals below are Python 2 syntax.
analyzer = StandardAnalyzer(
    expression=ur"[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+(?:\.?[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+)*"
) | StemFilter(stemfn)

from whoosh.qparser import FieldAliasPlugin
from whooshSymbolicQParser import MultifieldSQParser


class ExcerptFormatter(object):
    def __init__(self, between="..."):
        self.between = between

    def _format_fragment(self, text, fragment):
        output = []
        index = fragment.startchar
        for t in fragment.matches:
            if t.startchar > index:

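# StemFilter accepts any callable as its stemming function, which is how stemArabic is
# plugged in above. A minimal self-contained sketch with a toy stemmer (the function
# and sample text are illustrative, not from the original project):
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StemFilter


def toy_stem(word):
    # naive "stemmer": just drop a trailing plural "s"
    return word[:-1] if word.endswith("s") else word


toy_analyzer = RegexTokenizer() | LowercaseFilter() | StemFilter(toy_stem)
print([t.text for t in toy_analyzer(u"Stems and stones")])  # ['stem', 'and', 'stone']
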
def search(request):
    data = request.GET
    category_id = int(data.get('category_id', 0))
    order = int(data.get('order', ORDER_BY_MOST_RECENT))
    search_text = data.get('search_text', '').lower()
    tesis_services = TesisServices()
    total_full = list()
    tutors_full = list()
    all_full = tesis_services.get_by_category(category_id, order)
    if len(search_text) > 0:
        total_full, tutors_full = TesisServices.search_in_tesis(search_text, all_full)
        # For each search, in the table of searched words: if the word already exists
        # its counter is incremented by 1, otherwise it is inserted with a value of 1.
        # If the search input is not a single word but a phrase, Stop and Stemming
        # filters are applied and then the keywords (tokens) are extracted.
        """
        "Stop" words are words that are so common it's often counter-productive to
        index them, such as "and", "or", "if", etc. The provided analysis.StopFilter
        lets you filter out stop words, and includes a default list of common stop
        words. Stemming is a heuristic process of removing suffixes (and sometimes
        prefixes) from words to arrive (hopefully, most of the time) at the base word.
        """
        if len(search_text.split()) > 1:
            analyzer = LanguageAnalyzer("es")
            a_filters = StopFilter() | StemFilter()
            keywords = list(set([
                token.text
                for token in a_filters(analyzer(search_text, no_morph=True))
            ]))
        else:
            keywords = [search_text]
        for word in keywords:
            obj, created = Searches.objects.update_or_create(word=word)
            if not created:
                obj.count += 1
            else:
                if obj.count is None:
                    obj.count = 1
            obj.save()
    else:
        total_full = all_full
    # Update the most-searched words
    # Update the total number of searches and the number of distinct words
    searches_services = SearchesServices()
    searches_services.generate_resume()
    top_words_searched = searches_services.top_words_searched
    # Total number of distinct words
    total_words = searches_services.total_words
    # Total number of searches on the site
    total_searchs = searches_services.total_searchs
    # Pagination of the thesis list
    paginator = Paginator(total_full, 5)
    page = request.GET.get('page')
    tesis_list = paginator.get_page(page)
    the_data = {
        'tesis_list': render_to_string('sections/central_published_tesis.html', {
            'tesis_list': tesis_list,
            'question': search_text
        }),
        # serializers.serialize("json", [x for x in total_full]),
        'tutors_list': tutors_full,
        'top_words_searched': top_words_searched,
        'total_words': total_words,
        'total_searchs': total_searchs,
        'question': search_text
    }
    # the_data = serializers.serialize("json", [x for x in total_full])
    return JsonResponse(the_data)

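# Standalone sketch of the keyword-extraction idiom used above: run the language
# analyzer with no_morph=True (which skips its morphological filters, i.e. the
# built-in stemmer), then pipe the tokens through an explicit StopFilter | StemFilter
# chain. The phrase below is illustrative, not from the original project.
from whoosh.analysis import LanguageAnalyzer, StopFilter, StemFilter

es_analyzer = LanguageAnalyzer("es")
extra_filters = StopFilter() | StemFilter()
phrase = "sistemas de busqueda de tesis"
keywords = {t.text for t in extra_filters(es_analyzer(phrase, no_morph=True))}
print(keywords)
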
def __init__(self, index_path, names=None):
    self._analyzer = (SpaceSeparatedTokenizer() | LowercaseFilter()
                      | StopFilter(minsize=1, stoplist=stoplist) | StemFilter())
    if index.exists_in(index_path):
        self._ix = index.open_dir(index_path)
    else:
        self.build_index(index_path, names)
    self._qp = QueryParser("title", self._ix.schema, plugins=[])

import os

from whoosh.fields import TEXT, ID, KEYWORD, Schema
from whoosh.query import Term, Phrase, Or, And
from whoosh.analysis import SpaceSeparatedTokenizer, StopFilter, \
    LowercaseFilter, StemFilter
import whoosh.scoring

from DAS.keywordsearch.config import DEBUG as _DEBUG

INDEX_DIR = os.environ.get('DAS_KWS_IR_INDEX', '/tmp/das')

# Helper functions for manipulating keyword lists
tokenize = SpaceSeparatedTokenizer()
remove_stopwords = StopFilter()
tolower = LowercaseFilter()
stemmer = StemFilter()


def kwlist_no_stopwords(kwds):
    """ filter the keywords: remove stopwords, lower and tokenize """
    return [t.text for t in remove_stopwords(tolower(tokenize(kwds)))]


def kwlist_stemmed(kwds):
    """ filter the keywords: STEM, remove stopwords, lower and tokenize """
    return [t.text for t in stemmer(remove_stopwords(tolower(tokenize(kwds))))]


class SimpleIREntityAttributeMatcher(object):
    """ Each instance of matcher recreates the index.

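# Illustrative behaviour of the two helpers above (the keyword string is made up,
# not from DAS); the stop-word filter drops "of"/"the", and the stemmed variant
# also reduces plurals, so roughly:
#   kwlist_no_stopwords("Files of the dataset")  ->  ['files', 'dataset']
#   kwlist_stemmed("Files of the dataset")       ->  ['file', 'dataset']
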
import json
import logging

from whoosh.analysis import FancyAnalyzer, StemFilter
from whoosh.fields import Schema, ID, TEXT
from whoosh.index import create_in, open_dir, exists_in
from whoosh.qparser import QueryParser
from whoosh.query import Query, Variations

DEFAULT_SCHEMA = Schema(
    id=ID(unique=True, stored=True),
    data=ID(stored=True),
    content=TEXT(FancyAnalyzer() | StemFilter()),
)

logger = logging.getLogger(__name__)


class SearchIndex(object):
    def __init__(self, index_dir, schema=DEFAULT_SCHEMA, force_create=False):
        self.schema = schema
        if exists_in(index_dir) and not force_create:
            self.index = open_dir(index_dir, schema=schema)
        else:
            self.index = create_in(index_dir, schema=schema)

    @classmethod
    def is_empty(cls, index_dir):
        si = SearchIndex(index_dir)
        return si.index.is_empty()

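# Minimal usage sketch for the class above; the directory, document values and query
# are illustrative. Because the content field is stemmed, a query for "reduce" should
# match a document containing "reduces".
import os

if __name__ == '__main__':
    index_dir = '/tmp/example_index'   # assumed scratch location
    os.makedirs(index_dir, exist_ok=True)
    si = SearchIndex(index_dir, force_create=True)
    writer = si.index.writer()
    writer.add_document(id=u'1', data=u'doc-1',
                        content=u'Stemming reduces related word forms')
    writer.commit()
    with si.index.searcher() as searcher:
        query = QueryParser('content', si.schema).parse(u'reduce')
        print([hit['id'] for hit in searcher.search(query)])  # expected: ['1']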