Example #1
class RecipeSchema(SchemaClass):
    title = TEXT(stored=True,
                 multitoken_query='or',
                 analyzer=StandardAnalyzer() | StemFilter() | CustomFilter())
    directions = TEXT(stored=True,
                      multitoken_query='or',
                      analyzer=StandardAnalyzer() | StemFilter()
                      | CustomFilter())
    ingredients = TEXT(stored=True,
                       multitoken_query='or',
                       analyzer=StandardAnalyzer() | StemFilter()
                       | CustomFilter())
    categories = KEYWORD(stored=True, commas=True)
    calories = NUMERIC(stored=True)
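
A hedged usage sketch for the schema above; it assumes the whoosh imports and the project-specific CustomFilter are already in scope, as in the original module, and that the index directory already exists:

from whoosh import index

ix = index.create_in("recipe_index", RecipeSchema)
with ix.writer() as writer:
    writer.add_document(title=u"Tomato soup",
                        directions=u"Simmer the tomatoes until soft.",
                        ingredients=u"tomatoes, salt, basil",
                        categories=u"soup,vegetarian",
                        calories=120)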
Example #2
def GensimAnalyzer(stoplist=STOP_WORDS,
                   minsize=1,
                   stemfn=stem,
                   cachesize=50000):
    return (GensimTokenizer() | LowercaseFilter()
            | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize)
            | StopFilter(stoplist=stoplist, minsize=minsize))
Example #3
def ChineseAnalyzer(stoplist=STOP_WORDS,
                    minsize=1,
                    stemfn=stem,
                    cachesize=50000):
    return (ChineseTokenizer() | LowercaseFilter()
            | StopFilter(stoplist=stoplist, minsize=minsize)
            | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize))
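
Examples #2 and #3 follow the same pattern: a tokenizer piped into filters yields a callable analyzer that produces Token objects. A minimal, hedged illustration using whoosh's built-in RegexTokenizer in place of the custom tokenizers:

from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter, StemFilter

demo = RegexTokenizer() | LowercaseFilter() | StopFilter() | StemFilter()
print([t.text for t in demo(u"Stemming the indexed documents")])
# expected with the default stop list and Porter stemmer: ['stem', 'index', 'document']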
Example #4
def create_analyzer():
    conf = config.get_config()
    if conf['STOPWORDS']:
        if conf['CHARACTERS_FOLDING']:
            if conf['STEMMING']:
                if conf['QGRAMS']:
                    analyzer = RegexTokenizer() | StopFilter() | CharsetFilter(accent_map) | StemFilter() \
                               | NgramFilter(minsize=conf['QNUM_MIN'], maxsize=conf['QNUM_MAX'])
                else:
                    analyzer = RegexTokenizer() | StopFilter() | CharsetFilter(accent_map) | StemFilter()
            else:
                if conf['QGRAMS']:
                    analyzer = RegexTokenizer() | StopFilter() | CharsetFilter(accent_map) \
                               | NgramFilter(minsize=conf['QNUM_MIN'], maxsize=conf['QNUM_MAX'])
                else:
                    analyzer = RegexTokenizer() | StopFilter() | CharsetFilter(accent_map)
        else:
            if conf['STEMMING']:
                if conf['QGRAMS']:
                    analyzer = RegexTokenizer() | StopFilter() | StemFilter() \
                               | NgramFilter(minsize=conf['QNUM_MIN'], maxsize=conf['QNUM_MAX'])
                else:
                    analyzer = RegexTokenizer() | StopFilter() | StemFilter()
            else:
                if conf['QGRAMS']:
                    analyzer = RegexTokenizer() | StopFilter() \
                               | NgramFilter(minsize=conf['QNUM_MIN'], maxsize=conf['QNUM_MAX'])
                else:
                    analyzer = RegexTokenizer() | StopFilter()
    else:
        if conf['CHARACTERS_FOLDING']:
            if conf['STEMMING']:
                if conf['QGRAMS']:
                    analyzer = RegexTokenizer() | CharsetFilter(accent_map) | StemFilter() \
                               | NgramFilter(minsize=conf['QNUM_MIN'], maxsize=conf['QNUM_MAX'])
                else:
                    analyzer = RegexTokenizer() | CharsetFilter(accent_map) | StemFilter()
            else:
                if conf['QGRAMS']:
                    analyzer = RegexTokenizer() | CharsetFilter(accent_map) \
                               | NgramFilter(minsize=conf['QNUM_MIN'], maxsize=conf['QNUM_MAX'])
                else:
                    analyzer = RegexTokenizer() | CharsetFilter(accent_map)
        else:
            if conf['STEMMING']:
                if conf['QGRAMS']:
                    analyzer = RegexTokenizer() | StemFilter() \
                               | NgramFilter(minsize=conf['QNUM_MIN'], maxsize=conf['QNUM_MAX'])
                else:
                    analyzer = RegexTokenizer() | StemFilter()
            else:
                if conf['QGRAMS']:
                    analyzer = RegexTokenizer() | NgramFilter(minsize=conf['QNUM_MIN'], maxsize=conf['QNUM_MAX'])
                else:
                    analyzer = RegexTokenizer()
    log.print_debug(TAG, "Analyzer created")
    return analyzer
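
The sixteen branches above only toggle individual pipeline stages, so the same analyzer can be built incrementally. A hedged, equivalent sketch (the function name is hypothetical; the config keys and whoosh imports are assumed to be the same as in the original):

def create_analyzer_compact():
    conf = config.get_config()
    analyzer = RegexTokenizer()
    if conf['STOPWORDS']:
        analyzer = analyzer | StopFilter()
    if conf['CHARACTERS_FOLDING']:
        analyzer = analyzer | CharsetFilter(accent_map)
    if conf['STEMMING']:
        analyzer = analyzer | StemFilter()
    if conf['QGRAMS']:
        analyzer = analyzer | NgramFilter(minsize=conf['QNUM_MIN'],
                                          maxsize=conf['QNUM_MAX'])
    return analyzer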
Example #5
def CleanupStemmingAnalyzer(expression=default_pattern,
                            stoplist=STOP_WORDS,
                            minsize=2,
                            maxsize=None,
                            gaps=False,
                            stemfn=stem,
                            ignore=None,
                            cachesize=50000):

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    # added CleanupFilter here
    chain = ret | CleanupFilter() | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(
            stoplist=stoplist, minsize=minsize, maxsize=maxsize)
    return chain | StemFilter(
        stemfn=stemfn, ignore=ignore, cachesize=cachesize)
Example #6
try:
    from whoosh.index import _CURRENT_TOC_VERSION as whoosh_ix_ver
except ImportError:
    from whoosh.filedb.fileindex import _INDEX_VERSION as whoosh_ix_ver

from stemming import stemArabic


def stemfn(word):
    return stemArabic(stem(word))


# word_re = ur"[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]"
analyzer = StandardAnalyzer(
    expression=
    ur"[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+(?:\.?[\w\u064e\u064b\u064f\u064c\u0650\u064d\u0652\u0651\u0640]+)*"
) | StemFilter(stemfn)
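# NOTE: the ur"..." string literal above is Python 2 syntax and is rejected by
# Python 3. A hedged, untested adaptation would keep the same pattern in a plain
# raw string (r"..."), since the re module resolves \uXXXX escapes inside
# patterns itself from Python 3.3 onwards.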

from whoosh.qparser import FieldAliasPlugin
from whooshSymbolicQParser import MultifieldSQParser


class ExcerptFormatter(object):
    def __init__(self, between="..."):
        self.between = between

    def _format_fragment(self, text, fragment):
        output = []
        index = fragment.startchar

        for t in fragment.matches:
            if t.startchar > index:
Example #7
def search(request):
    data = request.GET
    category_id = int(data.get('category_id', 0))
    order = int(data.get('order', ORDER_BY_MOST_RECENT))
    search_text = data.get('search_text', '').lower()
    tesis_services = TesisServices()
    total_full = list()
    tutors_full = list()
    all_full = tesis_services.get_by_category(category_id, order)

    if len(search_text) > 0:
        total_full, tutors_full = TesisServices.search_in_tesis(
            search_text, all_full)

        # For each search, the table of searched words is updated: if the word already
        # exists its count is incremented by 1, otherwise it is inserted with a value of 1.
        # If the search input is not a single word but a phrase, Stop and Stemming
        # filters are applied and then the keywords (tokens) are extracted.
        """
        “Stop” words are words that are so common it’s often counter-productive to index them, such as “and”, 
        “or”, “if”, etc. The provided analysis.StopFilter lets you filter out stop words, and includes a default 
        list of common stop words.
        Stemming is a heuristic process of removing suffixes (and sometimes prefixes) from words to arrive (hopefully, 
        most of the time) at the base word.
        """
        if len(search_text.split()) > 1:
            analyzer = LanguageAnalyzer("es")
            a_filters = StopFilter() | StemFilter()
            keywords = list(
                set([
                    token.text for token in a_filters(
                        analyzer(search_text, no_morph=True))
                ]))
        else:
            keywords = [search_text]

        for word in keywords:
            obj, created = Searches.objects.update_or_create(word=word)
            if not created:
                obj.count += 1
            else:
                if obj.count is None:
                    obj.count = 1
            obj.save()
    else:
        total_full = all_full

    # Update the most-searched words
    # Update the total number of searches and the number of distinct words
    searches_services = SearchesServices()
    searches_services.generate_resume()
    top_words_searched = searches_services.top_words_searched
    # Total number of distinct words
    total_words = searches_services.total_words
    # Total number of searches on the site
    total_searchs = searches_services.total_searchs

    # Pagination of the thesis list
    paginator = Paginator(total_full, 5)
    page = request.GET.get('page')
    tesis_list = paginator.get_page(page)
    the_data = {
        'tesis_list': render_to_string('sections/central_published_tesis.html', {
            'tesis_list': tesis_list,
            'question': search_text
        }),
        # serializers.serialize("json", [x for x in total_full]),
        'tutors_list': tutors_full,
        'top_words_searched': top_words_searched,
        'total_words': total_words,
        'total_searchs': total_searchs,
        'question': search_text
    }
    # the_data = serializers.serialize("json", [x for x in total_full])
    return JsonResponse(the_data)
Example #8
def __init__(self, index_path, names=None):
    self._analyzer = (SpaceSeparatedTokenizer() | LowercaseFilter()
                      | StopFilter(minsize=1, stoplist=stoplist) | StemFilter())
    if index.exists_in(index_path):
        self._ix = index.open_dir(index_path)
    else:
        self.build_index(index_path, names)
    self._qp = QueryParser("title", self._ix.schema, plugins=[])
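
A hedged sketch of how an instance built this way might be queried; the search method below is hypothetical (it is not part of the snippet) and assumes the schema stores the fields it returns:

def search(self, text, limit=10):
    # parse against the "title" field configured in __init__, then run the query
    query = self._qp.parse(text)
    with self._ix.searcher() as searcher:
        # Hit.fields() returns the stored fields of each matching document
        return [hit.fields() for hit in searcher.search(query, limit=limit)]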
Example #9
import os

from whoosh.fields import TEXT, ID, KEYWORD, Schema
from whoosh.query import Term, Phrase, Or, And
from whoosh.analysis import SpaceSeparatedTokenizer, StopFilter, \
    LowercaseFilter, StemFilter
import whoosh.scoring

from DAS.keywordsearch.config import DEBUG as _DEBUG


INDEX_DIR = os.environ.get('DAS_KWS_IR_INDEX', '/tmp/das')

# Helper functions for manipulating keyword lists
tokenize = SpaceSeparatedTokenizer()
remove_stopwords = StopFilter()
tolower = LowercaseFilter()
stemmer = StemFilter()


def kwlist_no_stopwords(kwds):
    """ filter the keywords: remove stopwords, lower and tokenize """
    return [t.text for t in remove_stopwords(tolower(tokenize(kwds)))]


def kwlist_stemmed(kwds):
    """ filter the keywords: STEM, remove stopwords, lower and tokenize """
    return [t.text for t in stemmer(remove_stopwords(tolower(tokenize(kwds))))]
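
# Illustrative only (hedged): the exact output depends on whoosh's default stop
# list and Porter stemmer, but for a sample phrase it would look roughly like:
#   kwlist_no_stopwords("Files from the latest run")  ->  ['files', 'latest', 'run']
#   kwlist_stemmed("Files from the latest run")       ->  ['file', 'latest', 'run']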


class SimpleIREntityAttributeMatcher(object):
    """
    Each instance of matcher recreates the index.
Example #10
import json
import logging

from whoosh.analysis import FancyAnalyzer, StemFilter
from whoosh.fields import Schema, ID, TEXT
from whoosh.index import create_in, open_dir, exists_in
from whoosh.qparser import QueryParser
from whoosh.query import Query, Variations

DEFAULT_SCHEMA = Schema(
    id=ID(unique=True, stored=True),
    data=ID(stored=True),
    content=TEXT(FancyAnalyzer() | StemFilter()),
)

logger = logging.getLogger(__name__)


class SearchIndex(object):
    def __init__(self, index_dir, schema=DEFAULT_SCHEMA, force_create=False):
        self.schema = schema
        if exists_in(index_dir) and not force_create:
            self.index = open_dir(index_dir, schema=schema)
        else:
            self.index = create_in(index_dir, schema=schema)

    @classmethod
    def is_empty(cls, index_dir):
        si = SearchIndex(index_dir)
        return si.index.is_empty()
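
A hedged construction sketch for the class above (the directory is a placeholder path that must already exist; no methods beyond those shown in the excerpt are assumed):

si = SearchIndex("/tmp/demo_index", force_create=True)
print(SearchIndex.is_empty("/tmp/demo_index"))  # True for a freshly created index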