def build_schema(self, fields):
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)
    content_field_name = ''

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=int, field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, numtype=float, field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, at='start', stored=field_class.stored, field_boost=field_class.boost)
        else:
            schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=ChineseAnalyzer(), field_boost=field_class.boost, sortable=True)

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search.")

    return (content_field_name, Schema(**schema_fields))
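# Hypothetical usage sketch (not part of the snippet above): feed Haystack's
# registered fields into build_schema and create a Whoosh index from the
# result. The backend variable and the index path are assumptions.
from haystack import connections
from whoosh.filedb.filestore import FileStorage

all_fields = connections['default'].get_unified_index().all_searchfields()
content_field_name, schema = backend.build_schema(all_fields)  # backend: the custom WhooshSearchBackend (assumed)
storage = FileStorage('/tmp/whoosh_index')  # path is an assumption
storage.create()
ix = storage.create_index(schema)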
def __build_whoosh_index(self, schema_dir):
    fields = dict(
        node=TEXT(),
        fullname=TEXT(stored=True),
        path=TEXT(),
        type=NGRAM(minsize=4),
        study=NGRAM(field_boost=10.0),
        name=NGRAMWORDS(minsize=3, field_boost=3.0),
        metadata=NGRAMWORDS(minsize=3),
    )
    schema = Schema(**fields)
    self.ix = create_in(schema_dir, schema)
    with self.ix.writer(procs=2, multisegment=True, limitmb=512) as writer:
        for key, value in self._tree_dict.items():
            writer.add_document(node=key.replace('\\', ' ').replace('_', ' '),
                                path=value.get('conceptPath'),
                                fullname=key,
                                type=value.get('type'),
                                study=str(value.get('studyId')),
                                name=str(value.get('name')),
                                metadata=str(value.get('metadata')))
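# Hypothetical usage sketch (not part of the method above): query an index
# built this way. The directory name and search term are assumptions.
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser

ix = open_dir('schema_dir')  # same directory passed to __build_whoosh_index (assumed)
with ix.searcher() as searcher:
    # NGRAM/NGRAMWORDS fields match partial tokens, so 'gluco' can hit 'glucose'
    query = MultifieldParser(['name', 'metadata', 'study'], ix.schema).parse('gluco')
    for hit in searcher.search(query, limit=10):
        print(hit['fullname'])  # fullname is stored, so it is available on the hit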
def _to_whoosh_field(self, field, field_name=None):
    # If the field is an AutocompleteField or has a truthy partial_match
    # attribute, treat it as an autocomplete field.
    if isinstance(field, AutocompleteField) or \
            (hasattr(field, 'partial_match') and field.partial_match):
        whoosh_field = NGRAMWORDS(stored=False,
                                  minsize=self.ngram_length[0],
                                  maxsize=self.ngram_length[1],
                                  queryor=True)
    else:
        # TODO other types of fields: https://whoosh.readthedocs.io/en/latest/api/fields.html
        whoosh_field = TEXT(
            stored=False,
            field_boost=get_boost(field),
            lang=self.language,
            analyzer=self.analyzer,
        )
    if not field_name:
        field_name = _get_field_mapping(field)
    return field_name, whoosh_field
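# Illustrative sketch (not from the backend above): queryor=True makes a query
# against an NGRAMWORDS field OR its n-grams together instead of requiring all
# of them, which keeps partial-match queries forgiving. Names are placeholders.
import tempfile
from whoosh.fields import Schema, ID, NGRAMWORDS
from whoosh.index import create_in
from whoosh.qparser import QueryParser

demo_schema = Schema(pk=ID(stored=True),
                     name=NGRAMWORDS(minsize=2, maxsize=4, queryor=True))
demo_ix = create_in(tempfile.mkdtemp(), demo_schema)
with demo_ix.writer() as writer:
    writer.add_document(pk=u'1', name=u'autocomplete field')
with demo_ix.searcher() as searcher:
    hits = searcher.search(QueryParser('name', demo_schema).parse(u'autoc'))
    print([hit['pk'] for hit in hits])  # the partial term still matches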
def _setup(self):
    self._redis = getattr(self, '_redis', None)
    if not self._redis:
        self._redis = redis()  # XXX test cases won't get correctly unpickled because of this

    self.schema = Schema(content=NGRAMWORDS(stored=False))
    self.schema.add("object_id", ID(stored=True, unique=True))
    self.schema.add("entity_id", ID(stored=True, unique=True))
    self.schema.add('sha1', ID(stored=True, unique=True))
    for a in list(ATTRS.keys()):
        self.schema.add(a, KEYWORD())

    self.objects = self.xml_dict('objects')
    self.parts = self.json_dict('parts')
    self.storage = FileStorage(os.path.join(self._dir, self._name))
    try:
        self.index = self.storage.open_index(schema=self.schema)
    except BaseException as ex:
        log.warn(ex)
        self.storage.create()
        self.index = self.storage.create_index(self.schema)
        self._reindex()
class Indexer:
    schema = Schema(
        tagSubject=ID(stored=True),
        #tagPrefLabel=TEXT(stored=True),
        tagPrefLabel=NGRAMWORDS(minsize=2, maxsize=10, stored=True, field_boost=1.0,
                                tokenizer=None, at='start', queryor=False, sortable=False),
        termPrefLabel=NGRAMWORDS(minsize=2, maxsize=10, stored=True, field_boost=1.0,
                                 tokenizer=None, at='start', queryor=False, sortable=False),
        termAltLabel=NGRAMWORDS(minsize=2, maxsize=10, stored=True, field_boost=1.0,
                                tokenizer=None, at='start', queryor=False, sortable=False),
        termBroader=NGRAMWORDS(minsize=2, maxsize=10, stored=True, field_boost=1.0,
                               tokenizer=None, at='start', queryor=False, sortable=False),
        termNarrower=NGRAMWORDS(minsize=2, maxsize=10, stored=True, field_boost=1.0,
                                tokenizer=None, at='start', queryor=False, sortable=False),
        tagScopeNote=TEXT(stored=True),
        spellingEN=TEXT(stored=True, spelling=True),
        spellingDE=TEXT(stored=True, spelling=True))

    __writer = None
    wordSetEN = set()
    wordSetDE = set()

    def __init__(self, rdfGraph):
        if rdfGraph is None:
            return
        self.createNewIndex()
        count = 0
        for subject, predicate, obj in rdfGraph.graph:
            if rdfGraph.isInKeyScheme(subject) or rdfGraph.isInTagScheme(subject):
                if predicate == SKOS.prefLabel:
                    count += 1
                    print str(count) + ': Indexing tagPrefLabel: ' + str(obj)
                    label = utils.wsWord(obj)
                    lit = Literal(label, obj.language)
                    self.addTagPrefLabel(subject, lit)
                elif predicate == SKOS.scopeNote:
                    count += 1
                    print str(count) + ': Indexing tagScopeNote: ' + str(obj)
                    self.addTagScopeNote(subject, obj)
            elif rdfGraph.isInTermScheme(subject):
                tagSubjectList = self.getTagsOfRelTerm(rdfGraph, subject)
                if predicate == SKOS.prefLabel:
                    count += 1
                    lang = obj.language
                    if lang == 'en' or lang == 'de':
                        print str(count) + ': Indexing termPrefLabel: ' + str(obj)
                        self.addTermPrefLabel(tagSubjectList, obj)
                if predicate == SKOS.altLabel:
                    count += 1
                    lang = obj.language
                    if lang == 'en' or lang == 'de':
                        print str(count) + ': Indexing termAltLabel: ' + str(obj)
                        self.addTermAltLabel(tagSubjectList, obj)
                if predicate == SKOS.broader:
                    count += 1
                    lang = obj.language
                    if lang == 'en' or lang == 'de':
                        print str(count) + ': Indexing termBroader: ' + str(obj)
                        self.addTermBroader(tagSubjectList, obj)
                if predicate == SKOS.narrower:
                    count += 1
                    lang = obj.language
                    if lang == 'en' or lang == 'de':
                        print str(count) + ': Indexing termNarrower: ' + str(obj)
                        self.addTermNarrower(tagSubjectList, obj)
        self.addSpellings()
        self.commit()

    splitChars = re.compile('[ ="._,:;/\?\(\)\]\[\!\*]')

    def addToWordList(self, words, filterShort=None):
        lang = words.language
        wordList = self.splitChars.split(words)
        for word in wordList:
            if len(word) <= 1:
                continue
            if filterShort and len(word) <= filterShort:  # skip short words when 'filterShort' is set
                continue
            if lang == 'en':
                word = utils.eszettToSS(word)
                self.wordSetEN.add(word)
            elif lang == 'de':
                word = utils.eszettToSS(word)
                self.wordSetDE.add(word)
            else:
                translator = Translator()
                if not word in self.wordSetDE and word not in self.wordSetEN:
                    try:
                        transWordDE = translator.translateENToDE(word)
                        transWordDE = utils.eszettToSS(transWordDE)
                        self.wordSetDE.add(transWordDE)
                        self.wordSetEN.add(utils.eszettToSS(word))
                    except:
                        pass

    def addSpellings(self):
        countEN = 0
        countDE = 0
        for word in self.wordSetEN:
            countEN += 1
            print str(countEN) + ': Indexing EN spelling for word: ' + word
            self.__writer.add_document(spellingEN=unicode(word))
        for word in self.wordSetDE:
            countDE += 1
            print str(countDE) + ': Indexing DE spelling for word: ' + word
            self.__writer.add_document(spellingDE=unicode(word))

    def getTagsOfRelTerm(self, rdfGraph, relTermSubject):
        '''Returns a list of subjects that point to this RelatedTerm 'subject'.'''
        generatorList = rdfGraph.getRelatedMatch(relTermSubject)
        return utils.genToList(generatorList)

    def createNewIndex(self):
        ix = create_in(utils.indexerDir(), self.schema, indexname=utils.indexName)
        self.__writer = ix.writer()

    def addTagPrefLabel(self, tagSubject, tagPrefLabel):
        if not index.exists_in(utils.indexerDir(), utils.indexName):
            self.createNewIndex()
        self.__writer.add_document(tagSubject=unicode(tagSubject),
                                   tagPrefLabel=unicode(tagPrefLabel))
        self.addToWordList(tagPrefLabel)

    def addTagScopeNote(self, tagSubject, tagScopeNote):
        if not index.exists_in(utils.indexerDir(), utils.indexName):
            self.createNewIndex()
        self.__writer.add_document(tagSubject=unicode(tagSubject),
                                   tagScopeNote=unicode(tagScopeNote))
        self.addToWordList(tagScopeNote, 5)

    def addTermPrefLabel(self, tagSubjectList, termPrefLabel):
        if not index.exists_in(utils.indexerDir(), utils.indexName):
            self.createNewIndex()
        for tagSubject in tagSubjectList:
            self.__writer.add_document(tagSubject=unicode(tagSubject),
                                       termPrefLabel=unicode(termPrefLabel))
        self.addToWordList(termPrefLabel)

    def addTermAltLabel(self, tagSubjectList, termAltLabel):
        if not index.exists_in(utils.indexerDir(), utils.indexName):
            self.createNewIndex()
        for tagSubject in tagSubjectList:
            self.__writer.add_document(tagSubject=unicode(tagSubject),
                                       termAltLabel=unicode(termAltLabel))
        self.addToWordList(termAltLabel)

    def addTermBroader(self, tagSubjectList, termBroader):
        if not index.exists_in(utils.indexerDir(), utils.indexName):
            self.createNewIndex()
        for tagSubject in tagSubjectList:
            self.__writer.add_document(tagSubject=unicode(tagSubject),
                                       termBroader=unicode(termBroader))
        self.addToWordList(termBroader)

    def addTermNarrower(self, tagSubjectList, termNarrower):
        if not index.exists_in(utils.indexerDir(), utils.indexName):
            self.createNewIndex()
        for tagSubject in tagSubjectList:
            self.__writer.add_document(tagSubject=unicode(tagSubject),
                                       termNarrower=unicode(termNarrower))
        self.addToWordList(termNarrower)

    def commit(self):
        self.__writer.commit()
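    # Hypothetical usage sketch (not part of the original Indexer class): once
    # the index is committed, the spelling fields can back a "did you mean"
    # lookup via Whoosh's corrector API, reusing the utils helpers above.
    def suggestSpelling(self, word, lang='en'):
        ix = index.open_dir(utils.indexerDir(), indexname=utils.indexName)
        fieldName = 'spellingEN' if lang == 'en' else 'spellingDE'
        with ix.searcher() as searcher:
            return searcher.corrector(fieldName).suggest(word, limit=5)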
""" doc = WordPunctTokenizer().tokenize(txt) doc = [word for word in doc if word not in stopword_set] doc = [word for word in doc if word.isalpha()] return doc stopword_set = set(stopwords.words('german')) # Index schema analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) if USE_NGRAM: schema = Schema(title=TEXT(stored=True), body=TEXT(analyzer=analyzer, stored=True), ngrams=NGRAMWORDS(minsize=NGRAM_MIN_SIZE, maxsize=NGRAM_MAX_SIZE, stored=False, at=None)) else: schema = Schema(title=TEXT(stored=True), body=TEXT(analyzer=analyzer, stored=True)) # Empty index folder if needed if os.path.exists(INDEX_PATH): shutil.rmtree(INDEX_PATH, ignore_errors=True) os.mkdir(INDEX_PATH) # Remove possible Word2Vec remains for f in glob.glob(OUTPUT_W2VMODEL_BIN + '.*'): os.remove(f) # Get list of files to process
def build_schema(self, fields):
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    initial_key_count = len(schema_fields)
    content_field_name = ''

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = IDLIST(
                    stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ['date', 'datetime']:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            schema_fields[field_class.index_fieldname] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                minsize=2, maxsize=15, at='start',
                stored=field_class.stored,
                field_boost=field_class.boost)
        else:
            # schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=StemmingAnalyzer(), field_boost=field_class.boost, sortable=True)
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True,
                analyzer=ChineseAnalyzer(),
                field_boost=field_class.boost,
                sortable=True)

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            schema_fields[field_class.index_fieldname].spelling = True

    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search."
        )

    return (content_field_name, Schema(**schema_fields))
import unicodecsv as csv

from whoosh import index, sorting
from whoosh.analysis import StandardAnalyzer
from whoosh.fields import Schema, STORED, NGRAMWORDS, NUMERIC
from whoosh.qparser import MultifieldParser

_schema = Schema(
    ror=STORED(),
    grid=STORED(),
    name=NGRAMWORDS(stored=False),
    aliases=NGRAMWORDS(stored=False),
    num_students=NUMERIC(int, sortable=True, stored=False),
    citation_score=NUMERIC(int, sortable=True, stored=False),
)

_index_path = 'data/ror-whoosh-index'


def _read_ror_csv_rows():
    rows = []
    with open('data/ror-metrics.csv') as ror_csv:
        reader = csv.DictReader(ror_csv)
        for row in reader:
            row['aliases'] = row['aliases'].split(u'###') if row['aliases'] else []
            row['num_students'] = int(row['num_students']) if row['num_students'] else None
            row['citation_score'] = float(row['citation_score']) if row['citation_score'] else None
            rows.append(row)
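# Hypothetical usage sketch (not part of the original module): search the
# n-grammed name/aliases fields and order hits by the sortable citation_score
# column instead of by relevance. The helper name and query text are assumptions.
def _search_ror(query_text, limit=10):
    ix = index.open_dir(_index_path)
    with ix.searcher() as searcher:
        parser = MultifieldParser(['name', 'aliases'], ix.schema)
        by_citations = sorting.FieldFacet('citation_score', reverse=True)
        results = searcher.search(parser.parse(query_text), limit=limit, sortedby=by_citations)
        return [(hit['ror'], hit['grid']) for hit in results]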
if not os.path.exists(SSAWG_index_dir):
    print("Creating index folder...")
    os.mkdir(SSAWG_index_dir)

## Add fields programmatically by parsing the first line of the file
Searchable = ('Text', )
ix = index.create_in(SSAWG_index_dir, schema)
writer = ix.writer()
for field in fieldnames:
    if field in Searchable:
        print(field)
        writer.add_field(field,
                         NGRAMWORDS(minsize=NgramMin, maxsize=NgramMax,
                                    stored=True))  # May need to adjust size to allow for description
    else:
        writer.add_field(field, TEXT(stored=True, chars=True))

mtgCnt = 0
for Meeting in Meetings:  # Text is NGRAMMED, link is stored
    #print('-----------------')
    #print(str(Meeting.text))
    StrippedText = ''
    for item in Meeting.find_all('li'):  # for each list item...
        if item.text:
            CurStrip = item.text
        else:
            CurStrip = ''
        StrippedText += CurStrip.strip() + '\n'
        key = f'{eid}:{locale}:tags'
        for tag in tags['values']:
            storage.lpush(key, tag)


if __name__ == '__main__':
    print('-' * 30)
    print('Muzeeglot data ingestion')
    print('-' * 30)
    if exists(configuration.INGESTION_LOCK):
        print('WARN: ingestion lock detected, pass')
    else:
        print('INFO: evaluate tags corpus')
        tags_corpus = get_tags_corpus()
        print('INFO: create search index')
        if not exists(configuration.INDEX):
            makedirs(configuration.INDEX)
        schema = Schema(ngram=NGRAMWORDS(), name=STORED(), eid=STORED())
        index = create_in(configuration.INDEX, schema)
        writer = BufferedWriter(index, period=60, limit=200)
        ingest_languages(writer)
        ingest_tags(tags_corpus)
        ingest_entities(tags_corpus, writer)
        print('INFO: optimize and close index')
        writer.close()
        index.optimize()
        index.close()
        print('INFO: write ingestion lock')
        with open(configuration.INGESTION_LOCK, 'w') as stream:
            stream.write('ingested')
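# Hypothetical lookup sketch (not part of the ingestion script): open the index
# written above and search the ngram field; name and eid are STORED, so they
# come back on each hit. The helper name and limit are assumptions.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def suggest(text, limit=10):
    ix = open_dir(configuration.INDEX)
    with ix.searcher() as searcher:
        query = QueryParser('ngram', ix.schema).parse(text)
        return [(hit['name'], hit['eid']) for hit in searcher.search(query, limit=limit)]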
import os, time, threading

from whoosh.fields import Schema, KEYWORD, NGRAMWORDS, NUMERIC, TEXT
from whoosh.index import create_in, open_dir
from whoosh.writing import AsyncWriter
from whoosh.qparser import QueryParser, MultifieldParser, FieldsPlugin, FieldAliasPlugin
#from whoosh.analysis import StandardAnalyzer, StemmingAnalyzer, NgramFilter, NgramAnalyzer
#from whoosh.query import *

#https://whoosh.readthedocs.io/en/latest/quickstart.html
schema = Schema(id=NUMERIC(stored=True, unique=True, signed=False),
                category=TEXT,
                title=NGRAMWORDS(2, 20, True, 2.0),
                ingredients=KEYWORD,
                content=NGRAMWORDS(4, 20))
#TODO: synonyms https://whoosh.readthedocs.io/en/latest/api/lang/wordnet.html

search_path = "search"
ALWAYS_REBUILD = False
min_search_length = 2

if not os.path.exists(search_path):
    os.mkdir(search_path)


def rebuild_index():
    index = create_in(search_path, schema)
    writer = index.writer()
    writer.add_document(id=0, title="Test Words", content="super nice")
    writer.add_document(id=1, title="Apple Banana Cucumber")
    writer.add_document(id=2, title="Deck Elevator Floor", category="test")
    writer.add_document(id=3, title="Pen Pineapple Apple Pen")
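# Hypothetical search helper (the original rebuild_index is truncated above):
# query title/content with a MultifieldParser and honour min_search_length
# before parsing. The helper name and result handling are assumptions.
def search_index(text):
    if len(text) < min_search_length:
        return []
    ix = open_dir(search_path)
    with ix.searcher() as searcher:
        parser = MultifieldParser(["title", "content"], ix.schema)
        return [hit['id'] for hit in searcher.search(parser.parse(text), limit=20)]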
def build_schema(self, fields):
    # Copied from https://github.com/django-haystack/django-haystack/blob/v2.8.1/haystack/backends/whoosh_backend.py
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Grab the number of keys that are hard-coded into Haystack.
    # We'll use this to (possibly) fail slightly more gracefully later.
    initial_key_count = len(schema_fields)
    content_field_name = ""

    for field_name, field_class in fields.items():
        if field_class.is_multivalued:
            if field_class.indexed is False:
                schema_fields[field_class.index_fieldname] = WHOOSH_ID(
                    stored=True, field_boost=field_class.boost)
            else:
                schema_fields[field_class.index_fieldname] = KEYWORD(
                    stored=True, commas=True, scorable=True,
                    field_boost=field_class.boost)
        elif field_class.field_type in ["date", "datetime"]:
            schema_fields[field_class.index_fieldname] = DATETIME(
                stored=field_class.stored, sortable=True)
        elif field_class.field_type == "integer":
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=int,
                field_boost=field_class.boost)
        elif field_class.field_type == "float":
            schema_fields[field_class.index_fieldname] = NUMERIC(
                stored=field_class.stored, numtype=float,
                field_boost=field_class.boost)
        elif field_class.field_type == "boolean":
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            schema_fields[field_class.index_fieldname] = BOOLEAN(
                stored=field_class.stored)
        elif field_class.field_type == "ngram":
            schema_fields[field_class.index_fieldname] = NGRAM(
                minsize=3, maxsize=15, stored=field_class.stored,
                field_boost=field_class.boost)
        elif field_class.field_type == "edge_ngram":
            schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                minsize=2, maxsize=15, at="start",
                stored=field_class.stored,
                field_boost=field_class.boost,
            )
        else:
            schema_fields[field_class.index_fieldname] = TEXT(
                stored=True,
                analyzer=getattr(field_class, "analyzer", StemmingAnalyzer()),
                field_boost=field_class.boost,
                sortable=True,
            )

        schema_fields[field_class.index_fieldname].field_name = field_name

        if field_class.document is True:
            content_field_name = field_class.index_fieldname
            schema_fields[field_class.index_fieldname].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= initial_key_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search."
        )

    return (content_field_name, Schema(**schema_fields))
from whoosh.fields import Schema, TEXT, STORED, NGRAMWORDS
from whoosh.index import create_in, open_dir
from whoosh.analysis import StandardAnalyzer, StemmingAnalyzer, NgramFilter, NgramAnalyzer, NgramWordAnalyzer
#from whoosh.query import *
from whoosh.qparser import QueryParser, MultifieldParser, FieldsPlugin

analyzer = NgramAnalyzer(3)
schema = Schema(
    id=STORED,
    category=TEXT(field_boost=3.0),
    #title = TEXT(analyzer, False)
    title=NGRAMWORDS(2, 20, False, 2.0))

index = create_in("search", schema)
#index = open_dir("search")
writer = index.writer()
writer.add_document(id=0, title="Test Words")
writer.add_document(id=1, title="Apple Banana Cucumber")
writer.add_document(id=2, title="Deck Elevator Floor", category="test")
writer.add_document(id=3, title="Pen Pineapple Apple Pen")
writer.commit()

#parser = QueryParser("title", schema)
parser = MultifieldParser(["category", "title"], schema, {
    "category": 3.0,
    "title": 2.0
})
parser.remove_plugin_class(FieldsPlugin)

with index.searcher() as searcher:
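    # Hypothetical completion (the original snippet is truncated here): run a
    # sample query across both boosted fields and print the stored ids.
    results = searcher.search(parser.parse(u"apple"), limit=10)
    for hit in results:
        print(hit['id'], hit.score)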
json_data_dir = '/srv/wallacewebapp/climasng/data'
species_json_file = 'species.json'
summaries_json_file = 'summaries.json'
search_index_dir = os.path.join(json_data_dir, 'searchindex')

if os.path.isdir('/Users/pvrdwb'):
    # ..overwrite with local dev paths
    json_data_dir = '/Users/pvrdwb/projects/climas-global/webapp/climasng/data'
    search_index_dir = os.path.join(json_data_dir, 'searchindex')

# define schema for indexed info
schema = Schema(nice_name=NGRAMWORDS(2, 8, at='start', sortable=True, stored=True),
                name_id=ID(stored=True, unique=True),
                item_id=ID(stored=True),
                item_path=STORED,
                item_type=KEYWORD(stored=True))

debug_output_level = 3  # max 5 (only errors show)

# -------------------------------------------------------------------

def msg(message, debug_level=3):
    ''' debug level goes from 1 (very minor) to 5 (massive problem) '''
    if debug_level >= debug_output_level:
        prefix = ['', 'dbug', 'info', 'mesg', 'warn', 'BOOM'][debug_level]