def analyzeText(text):
    """
    Analyzes the text to make word searching easier.

    Creates an analyzer for the Portuguese language which chains a
    lowercase filter, a stop-word filter and a stemming filter to process
    the text it receives.

    Parameters
    ----------
    text : str
        Document in plain-text format.

    Returns
    -------
    str
        Document text after being processed.
    """
    languageAnalyzer = LanguageAnalyzer("pt")
    langText = ""
    for token in languageAnalyzer(text):
        langText += "".join(token.text)
        langText += " "
    return langText
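# Minimal usage sketch for analyzeText(); the exact tokens depend on Whoosh's
# Portuguese stemmer and stop-word list, so no fixed output is assumed.
print(analyzeText("Os gatos correm pelo jardim"))
# prints the lowercased, stop-word-free, stemmed tokens joined by spaces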
def get_words(self, unit):
    """Return list of word pairs for a unit."""
    words = set()

    # Prepare analyzers
    # - standard analyzer simply splits words
    # - stemming extracts stems, to catch things like plurals
    analyzers = [
        (SimpleAnalyzer(), True),
        (SimpleAnalyzer(expression=SPLIT_RE, gaps=True), True),
        (StandardAnalyzer(), False),
        (StemmingAnalyzer(), False),
    ]
    source_language = unit.translation.subproject.project.source_language
    lang_code = source_language.base_code()

    # Add per language analyzer if Whoosh has it
    if has_stemmer(lang_code):
        analyzers.append((LanguageAnalyzer(lang_code), False))

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append((NgramAnalyzer(4), False))

    # Extract words from all plurals and from context
    for text in unit.get_source_plurals() + [unit.context]:
        for analyzer, combine in analyzers:
            # Some Whoosh analyzers break on unicode
            new_words = []
            try:
                new_words = [token.text for token in analyzer(text)]
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error, sys.exc_info())
            words.update(new_words)
            # Add combined strings so that multi-word dictionary entries can
            # match, combining up to 5 consecutive words
            if combine:
                words.update([
                    ' '.join(new_words[x:y])
                    for x in range(len(new_words))
                    for y in range(1, min(x + 6, len(new_words) + 1))
                    if x != y
                ])

    # Grab all words in the dictionary
    dictionary = self.filter(
        project=unit.translation.subproject.project,
        language=unit.translation.language
    )

    if '' in words:
        words.remove('')

    if len(words) == 0:
        # No extracted words, no dictionary
        dictionary = dictionary.none()
    else:
        # Build the query for fetching the words
        # Can not use __in as we want case insensitive lookup
        dictionary = dictionary.filter(source__iregex=r'^({0})$'.format(
            '|'.join([re_escape(word) for word in words])
        ))

    return dictionary
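# Standalone illustration of the word-combination step above, using a
# hypothetical token list; it shows which multi-word phrases end up in the
# lookup set (the empty joins produced when y <= x are discarded later anyway).
new_words = ["hello", "brave", "new", "world"]
combined = {
    ' '.join(new_words[x:y])
    for x in range(len(new_words))
    for y in range(1, min(x + 6, len(new_words) + 1))
    if x != y
}
combined.discard('')
print(sorted(combined))
# ['brave', 'brave new', 'brave new world', 'hello', 'hello brave',
#  'hello brave new', 'hello brave new world', 'new', 'new world', 'world']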
def get_words(self, unit):
    """Return list of word pairs for a unit."""
    words = set()
    source_language = unit.translation.component.project.source_language

    # Filters stop words for a language
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()

    # Prepare analyzers
    # - simple analyzer just splits words based on regexp
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error)
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break

    if '' in words:
        words.remove('')

    if not words:
        # No extracted words, no dictionary
        return self.none()

    # Build the query for fetching the words
    # We want case insensitive lookup
    return self.filter(
        project=unit.translation.component.project,
        language=unit.translation.language,
        source__iregex=r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
            '|'.join(re_escape(word) for word in islice(words, 1000))
        ),
    )
def get_words(self, unit):
    """Return list of word pairs for a unit."""
    words = set()

    # Prepare analyzers
    # - standard analyzer simply splits words
    # - stemming extracts stems, to catch things like plurals
    analyzers = [
        StandardAnalyzer(),
        StemmingAnalyzer(),
    ]
    source_language = unit.translation.subproject.project.source_language
    lang_code = source_language.base_code()

    # Add per language analyzer if Whoosh has it
    if has_stemmer(lang_code):
        analyzers.append(LanguageAnalyzer(lang_code))

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    for text in unit.get_source_plurals() + [unit.context]:
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(
                    [token.text for token in analyzer(force_text(text))]
                )
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error, sys.exc_info())

    # Grab all words in the dictionary
    dictionary = self.filter(
        project=unit.translation.subproject.project,
        language=unit.translation.language
    )

    if len(words) == 0:
        # No extracted words, no dictionary
        dictionary = dictionary.none()
    else:
        # Build the query for fetching the words
        # Can not use __in as we want case insensitive lookup
        query = Q()
        for word in words:
            query |= Q(source__iexact=word)

        # Filter dictionary
        dictionary = dictionary.filter(query)

    return dictionary
def exec_comp():
    '''
    Calculates the MRR (Mean Reciprocal Rank) and saves a table with the MRR
    evaluation for every search-engine configuration.
    '''
    # text analyzers
    selected_analyzers = [
        StemmingAnalyzer(),
        SimpleAnalyzer(),
        StandardAnalyzer(),
        RegexAnalyzer(),
        FancyAnalyzer(),
        NgramAnalyzer(5),
        KeywordAnalyzer(),
        LanguageAnalyzer('en')
    ]

    # analyzer labels used for the plot and for the MRR table
    sel_ana = [
        'StemmingAnalyzer()',
        'SimpleAnalyzer()',
        'StandardAnalyzer()',
        'RegexAnalyzer()',
        'FancyAnalyzer()',
        'NgramAnalyzer(5)',
        'KeywordAnalyzer()',
        'LanguageAnalyzer()'
    ]

    i = 0  # counter
    mrrs = []  # list where MRR values for each SE configuration will be stored

    # scoring functions
    scoring_functions = [
        scoring.TF_IDF(),
        scoring.Frequency(),
        scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)
    ]
    scor_func = [' TF_IDF', ' Frequency', ' BM25F']

    # ground truth
    gt1 = pd.read_csv(
        os.getcwd() + "/part_1/Cranfield_DATASET/cran_Ground_Truth.tsv",
        sep='\t')

    # combinations of every chosen analyzer with every chosen scoring function
    for x in range(len(selected_analyzers)):
        for y in range(len(scoring_functions)):
            print(sel_ana[x] + scor_func[y])
            i = i + 1
            # execute queries for the chosen configuration combination
            sr_1 = exec_queries(selected_analyzers[x], scoring_functions[y])
            # save results of the search engine
            sr_1.to_csv(os.getcwd() + "/part_1/" + str(i) + "__.csv", index=False)
            # calculate MRR
            mrrs.append((sel_ana[x] + scor_func[y], mrr(gt1, sr_1)))

    mrrs_saving = pd.DataFrame(mrrs)
    mrrs_saving.to_csv(os.getcwd() + "/part_1/mrrs.csv", index=False)  # store MRR table
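# Hedged sketch of the mrr() helper used above (the real implementation lives
# elsewhere in the project). Column names such as "Query_ID", "Doc_ID", "Rank"
# and "Relevant_Doc_id" are assumptions made for illustration only.
def mrr(ground_truth, search_results):
    """Mean Reciprocal Rank: mean of 1/rank of the first relevant result per query."""
    reciprocal_ranks = []
    for query_id, group in ground_truth.groupby("Query_ID"):
        relevant_docs = set(group["Relevant_Doc_id"])
        hits = search_results[search_results["Query_ID"] == query_id].sort_values("Rank")
        rr = 0.0
        for rank, doc_id in enumerate(hits["Doc_ID"], start=1):
            if doc_id in relevant_docs:
                rr = 1.0 / rank
                break
        reciprocal_ranks.append(rr)
    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0.0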
def _get_schema(self, language):
    lang_analyzer = LanguageAnalyzer(language)
    return Schema(
        key=ID(stored=True, unique=True),
        assignee=ID(stored=True),
        reporter=ID(stored=True),
        status=ID(stored=True),
        summary=TEXT(analyzer=lang_analyzer, field_boost=2.0),
        description=TEXT(analyzer=lang_analyzer),
        comments_str=TEXT(analyzer=lang_analyzer),
        labels=KEYWORD(stored=True, lowercase=True),
        components=KEYWORD(stored=True, lowercase=True),
    )
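# Usage sketch for the schema above; `issue_indexer` stands for an instance of
# the (unshown) class defining _get_schema, and the issue values are made up.
import os
from whoosh import index

schema = issue_indexer._get_schema("en")
if not os.path.exists("issue_index"):
    os.mkdir("issue_index")
ix = index.create_in("issue_index", schema)
writer = ix.writer()
writer.add_document(
    key="PROJ-42",
    assignee="alice",
    reporter="bob",
    status="Open",
    summary="Crash when saving settings",
    description="The application crashes with a traceback when settings are saved.",
    comments_str="Reproduced on version 1.2.3.",
    labels="crash settings",
    components="ui",
)
writer.commit()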
def __init__(self, index_path, language):
    from whoosh import index as whoosh_index
    from whoosh.fields import Schema, TEXT, ID
    from whoosh import qparser
    from whoosh.highlight import UppercaseFormatter
    from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
    from whoosh.lang import has_stemmer, has_stopwords
    import os

    if not has_stemmer(language) or not has_stopwords(language):
        # TODO Display a warning?
        analyzer = SimpleAnalyzer()
    else:
        analyzer = LanguageAnalyzer(language)

    self.schema = Schema(path=ID(unique=True, stored=True),
                         body=TEXT(analyzer=analyzer))
    self.formatter = UppercaseFormatter()
    self.index_path = index_path

    if not os.path.exists(index_path):
        try:
            os.mkdir(index_path)
        except OSError as e:
            sys.exit("Error creating Whoosh index: %s" % e)

    if whoosh_index.exists_in(index_path):
        try:
            self.search_index = whoosh_index.open_dir(index_path)
        except whoosh_index.IndexError as e:
            sys.exit("Error opening whoosh index: {0}".format(e))
    else:
        self.search_index = whoosh_index.create_in(index_path, self.schema)

    self.query_parser = qparser.MultifieldParser(["body", "path"],
                                                 schema=self.schema)
    self.query_parser.add_plugin(qparser.FuzzyTermPlugin())
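# Usage sketch; `idx` stands for an instance of the class whose __init__ is
# shown above (the class name is not included in the snippet), and the document
# path, body and query term are made up.
writer = idx.search_index.writer()
writer.update_document(path="docs/readme.md", body="Whoosh powers the site search.")
writer.commit()

with idx.search_index.searcher() as searcher:
    # A trailing ~ enables fuzzy matching through the FuzzyTermPlugin.
    query = idx.query_parser.parse("serch~")
    for hit in searcher.search(query, limit=5):
        print(hit["path"])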
def index_all():
    db_util.init_db()

    stemmer = Stemmer.Stemmer('russian')
    whoosh_ru_stemmer = RussianStemmer()
    analyzer = LanguageAnalyzer('russian')
    schema = Schema(transcription_id=ID(stored=True),
                    transcript=TEXT(stored=True, analyzer=analyzer))

    if not os.path.exists(const.TRANSCRIBED_WHOOSH_INDEX_DIR_PATH):
        os.makedirs(const.TRANSCRIBED_WHOOSH_INDEX_DIR_PATH)

    # recreate new index
    ix = create_in(const.TRANSCRIBED_WHOOSH_INDEX_DIR_PATH, schema)
    writer = ix.writer()
    for item in db_util.get_all_items():
        writer.add_document(transcription_id=str(item.id).decode('utf-8'),
                            transcript=item.transcription)
    writer.commit()
def get_terms(self, unit):
    """Return list of term pairs for a unit."""
    words = set()
    source_language = unit.translation.component.project.source_language

    # Filters stop words for a language
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()

    # Prepare analyzers
    # - basic simple analyzer to split on non-word chars
    # - simple analyzer that splits words on a regexp to catch in-word dashes
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer() | stopfilter,
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError):
                report_error(cause="Term words parsing")
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break

    if "" in words:
        words.remove("")

    if not words:
        # No extracted words, no glossary
        return self.none()

    # Build the query for fetching the words
    # We want case insensitive lookup
    words = islice(words, 1000)
    if settings.DATABASES["default"]["ENGINE"] == "django.db.backends.postgresql":
        # Use regex as that is utilizing pg_trgm index
        results = self.filter(
            source__iregex=r"(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])".format(
                "|".join(re_escape(word) for word in words)
            ),
        )
    else:
        # MySQL
        results = self.filter(
            reduce(
                lambda x, y: x | y,
                (models.Q(source__search=word) for word in words),
            ),
        )

    return results.for_project(unit.translation.component.project).filter(
        language=unit.translation.language
    )
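# Illustration of the regular expression built in the PostgreSQL branch above,
# using a hypothetical word set (re_escape is skipped because the sample words
# contain no regex metacharacters). It matches whole words delimited by
# whitespace or string boundaries.
words = ["hello", "world"]
pattern = r"(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])".format("|".join(words))
print(pattern)
# (^|[ \t\n\r\f\v])(hello|world)($|[ \t\n\r\f\v])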
from __future__ import unicode_literals

import copy

from django.conf import settings
from django.test import TestCase, override_settings
from django.utils import timezone

from wagtail.search.tests.test_backends import BackendTests
from wagtail.tests.search import models

from whoosh.analysis import LanguageAnalyzer

sv_search_setttings_language = copy.deepcopy(settings.WAGTAILSEARCH_BACKENDS)
sv_search_setttings_language['default']['LANGUAGE'] = 'sv'

analyzer_swedish = LanguageAnalyzer('sv')
sv_search_setttings_analyzer = copy.deepcopy(settings.WAGTAILSEARCH_BACKENDS)
sv_search_setttings_analyzer['default']['ANALYZER'] = analyzer_swedish


class TestWhooshSearchBackend(BackendTests, TestCase):
    backend_path = 'wagtail_whoosh.backend'

    def test_facet(self):
        pass

    def test_facet_tags(self):
        pass

    def test_facet_with_nonexistent_field(self):
        pass
def search(request):
    data = request.GET
    category_id = int(data.get('category_id', 0))
    order = int(data.get('order', ORDER_BY_MOST_RECENT))
    search_text = data.get('search_text', '').lower()
    tesis_services = TesisServices()
    total_full = list()
    tutors_full = list()
    all_full = tesis_services.get_by_category(category_id, order)
    if len(search_text) > 0:
        total_full, tutors_full = TesisServices.search_in_tesis(search_text, all_full)
        # For every search, update the table of searched words: if the word already
        # exists its counter is incremented by 1, otherwise it is inserted with value 1.
        # If the search input is a phrase rather than a single word, Stop and Stemming
        # filters are applied and then the keywords (tokens) are extracted.
        """
        "Stop" words are words that are so common it's often counter-productive to
        index them, such as "and", "or", "if", etc. The provided analysis.StopFilter
        lets you filter out stop words, and includes a default list of common stop
        words.
        Stemming is a heuristic process of removing suffixes (and sometimes prefixes)
        from words to arrive (hopefully, most of the time) at the base word.
        """
        if len(search_text.split()) > 1:
            analyzer = LanguageAnalyzer("es")
            a_filters = StopFilter() | StemFilter()
            keywords = list(
                set([
                    token.text
                    for token in a_filters(analyzer(search_text, no_morph=True))
                ]))
        else:
            keywords = [search_text]
        for word in keywords:
            obj, created = Searches.objects.update_or_create(word=word)
            if not created:
                obj.count += 1
            else:
                if obj.count is None:
                    obj.count = 1
            obj.save()
    else:
        total_full = all_full

    # Update the most-searched words
    # Update the total number of searches and the count of distinct words
    searches_services = SearchesServices()
    searches_services.generate_resume()
    top_words_searched = searches_services.top_words_searched
    # Total number of distinct words
    total_words = searches_services.total_words
    # Total number of searches on the site
    total_searchs = searches_services.total_searchs

    # Paginate the thesis list
    paginator = Paginator(total_full, 5)
    page = request.GET.get('page')
    tesis_list = paginator.get_page(page)

    the_data = {
        'tesis_list': render_to_string('sections/central_published_tesis.html', {
            'tesis_list': tesis_list,
            'question': search_text
        }),
        # serializers.serialize("json", [x for x in total_full]),
        'tutors_list': tutors_full,
        'top_words_searched': top_words_searched,
        'total_words': total_words,
        'total_searchs': total_searchs,
        'question': search_text
    }
    # the_data = serializers.serialize("json", [x for x in total_full])
    return JsonResponse(the_data)
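# Standalone sketch of the phrase branch above: the Spanish LanguageAnalyzer
# followed by explicit stop/stem filters. The exact tokens depend on Whoosh's
# Spanish stemmer and stop-word list, so no fixed output is assumed; the sample
# phrase is made up.
from whoosh.analysis import LanguageAnalyzer, StopFilter, StemFilter

analyzer = LanguageAnalyzer("es")
a_filters = StopFilter() | StemFilter()
keywords = {
    token.text
    for token in a_filters(analyzer("análisis de sistemas distribuidos", no_morph=True))
}
print(keywords)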
import copy

from django.conf import settings
from django.test import TestCase, override_settings

from wagtail.search.index import AutocompleteField
from wagtail.search.tests.test_backends import BackendTests
from wagtail.tests.search import models

from whoosh.analysis import LanguageAnalyzer
from whoosh.analysis.ngrams import NgramFilter

sv_search_setttings_language = copy.deepcopy(settings.WAGTAILSEARCH_BACKENDS)
sv_search_setttings_language["default"]["LANGUAGE"] = "sv"

analyzer_swedish = LanguageAnalyzer("sv")
sv_search_setttings_analyzer = copy.deepcopy(settings.WAGTAILSEARCH_BACKENDS)
sv_search_setttings_analyzer["default"]["ANALYZER"] = analyzer_swedish

indexing_resources = copy.deepcopy(settings.WAGTAILSEARCH_BACKENDS)
indexing_resources["default"]["MEMORY"] = 2048
indexing_resources["default"]["PROCS"] = 2

ngram_length = copy.deepcopy(settings.WAGTAILSEARCH_BACKENDS)
ngram_length["default"]["NGRAM_LENGTH"] = (3, 9)


class TestWhooshSearchBackend(BackendTests, TestCase):
    backend_path = "wagtail_whoosh.backend"

    def test_facet(self):
        pass
# customize highlight formatter
class HighlightFormatter(Formatter):
    def format_token(self, text, token, replace=False):
        # Use the get_text function to get the text corresponding to the token
        tokentext = get_text(text, token, replace)

        # Return the text as you want it to appear in the highlighted string
        return "<mark>%s</mark>" % tokentext


hf = HighlightFormatter()        # formatter for highlighting
wf = WholeFragmenter()           # fragmenter for splitting words
es_ana = LanguageAnalyzer("es")  # Whoosh analyzer for Spanish

# Load Whoosh index
index = open_dir("whoosh_index")

# Initialize Whoosh parser
parser = QueryParser("text", schema=index.schema)


@app.route("/")
def load_index():
    return render_template("index.html")


@app.route("/api/greguerias/all/", methods=['GET'])
def get_all_greguerias():
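# Usage sketch for the formatter and fragmenter above; it assumes the index has
# a stored "text" field and uses a made-up query term.
with index.searcher() as searcher:
    results = searcher.search(parser.parse("luna"), limit=10)
    results.formatter = hf   # wrap matched terms in <mark> tags
    results.fragmenter = wf  # keep the whole field text as a single fragment
    for hit in results:
        print(hit.highlights("text"))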
def indexer(data):
    stopwords_pt = get_stop_words()
    # for p in stopwords_pt:
    #     print u(p)

    ana = LanguageAnalyzer("pt")
    schema = Schema(link=TEXT(stored=True),
                    title=TEXT(stored=True, analyzer=ana),
                    summary=TEXT(stored=True, analyzer=ana),
                    content=TEXT(stored=True, analyzer=ana))

    if not os.path.exists("pulledfeeds"):
        os.mkdir("pulledfeeds")
        ix = create_in("pulledfeeds", schema)
    else:
        ix = index.open_dir("pulledfeeds")

    writer = ix.writer()
    for item in data:
        if item['content'] == '':
            cont = u" "
        else:
            cont = item['content']
        writer.add_document(link=item['link'],
                            title=item['title'],
                            summary=item['summary'],
                            content=cont)
    writer.commit()

    # for w in stopwords.words('portuguese'):
    #     print w

    for item in data:
        link = item['link']
        title = item['title']
        summary = item['summary']
        content = item['content']
        sentencesArray = [title, summary, content]
        person_list = []

        for sentences in sentencesArray:
            sentence_sem_stop = []
            tokens = nltk.tokenize.word_tokenize(sentences, 'portuguese')
            for w in tokens:
                p = w.lower()
                if p not in stopwords.words('portuguese') and p not in stopwords_pt:
                    sentence_sem_stop.append(w)

            pos = nltk.pos_tag(sentence_sem_stop)
            sentt = nltk.ne_chunk(pos, binary=False)
            # print sentt

            person = []
            name = ""
            for t in sentt:
                if hasattr(t, 'label') and t.label:
                    # print t.label()
                    if t.label() == 'PERSON' or t.label() == 'ORGANIZATION':
                        for leaf in t.leaves():
                            person.append(leaf[0])
                        # if len(person) > 1:  # avoid grabbing lone surnames
                        for part in person:
                            name += part + ' '
                        if name[:-1] not in person_list:
                            person_list.append(name[:-1])
                        name = ''
                        person = []

        entidades = "| "
        for ent in person_list:
            entidades += ent + " | "
        if len(person_list) == 0:
            entidades = " none"

        save_ent.save_relations(person_list)

        entidades2 = "| "
        for ent in person_list:
            if save_ent.checkIfEntityWikiExists(ent):
                entidades2 += ent + " | "

        save_ent.save_entities(link, entidades)

        print "entidades ", entidades
        print "entidades2 ", entidades2
        print "---------"
Time_GT = sw1_utils_query.GT_Q_read(directory_containing_Time_GT, True)

# reading and storing queries for both datasets
directory_containing_Cran_Q = '../Cranfield_DATASET/cran_Queries.tsv'
directory_containing_Time_Q = '../Time_DATASET/time_Queries.tsv'
Cran_Q = sw1_utils_query.GT_Q_read(directory_containing_Cran_Q, False)
Time_Q = sw1_utils_query.GT_Q_read(directory_containing_Time_Q, False)

###
### Define a Text-Analyzer
###
selected_analyzer = [
    SimpleAnalyzer(),
    StandardAnalyzer(),
    LanguageAnalyzer('en')
]
analyzer_names = ['Simple', 'Standard', 'Language']

###
### Create a Schema
###
datasets = ['Cranfield_DATASET', 'Time_DATASET']
datasets_len = [1400, 423]
dir_idx_list = []  # list to save directories for indexes

# For each dataset and each analyzer, create an empty index based on the schema
# in a separate directory, then fill the empty index by parsing the dataset and
# save the directory into the list.
for idx in range(len(datasets)):
    if datasets[idx] == 'Cranfield_DATASET':
        for i in range(len(selected_analyzer)):
            num_added_records_so_far += 1
            if (num_added_records_so_far % 100 == 0):
                print(" num_added_records_so_far= " + str(num_added_records_so_far))
    #
    writer.commit()   # it is necessary to store the index once filled
    in_file.close()   # it is necessary to close the .csv file


'''
Here the "schemas" function is used to create and fill all the schemas (indexes)
for both .csv files (Cranfield.csv and Time.csv).
'''
# all the analyzers that are used
analyzers = [StemmingAnalyzer(), StandardAnalyzer(), RegexAnalyzer(),
             SimpleAnalyzer(), FancyAnalyzer(), NgramAnalyzer(4),
             KeywordAnalyzer(), LanguageAnalyzer('en')]
# analyzer names
analyzer_names = ['StemmingAnalyzer', 'StandardAnalyzer', 'RegexAnalyzer',
                  'SimpleAnalyzer', 'FancyAnalyzer', 'NgramAnalyzer',
                  'KeywordAnalyzer', 'LanguageAnalyzer']
# file names
csv_names = ['Cranfield', 'Time']

# iterate over all the .csv files (here only two: Cranfield.csv and Time.csv)
for name in csv_names:
    print(name, '\n\n')
    path = "C:./" + name + "_DATASET"  # get the path where the .csv is stored
    # iterate to create the 8 different inverted indexes, one per analyzer
    for e, type_analyzer in enumerate(analyzers):
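# Hedged sketch of the per-analyzer index creation that the loop above drives
# (the snippet is truncated before that step). The directory layout and the CSV
# column names ("id", "title", "body") are assumptions for illustration, not
# the project's actual schemas() implementation.
import csv
import os
from whoosh.fields import Schema, ID, TEXT
from whoosh.index import create_in

def build_index(dataset_path, analyzer, analyzer_name, csv_file):
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=True, analyzer=analyzer),
                    content=TEXT(analyzer=analyzer))
    index_dir = os.path.join(dataset_path, "index_" + analyzer_name)
    os.makedirs(index_dir, exist_ok=True)
    ix = create_in(index_dir, schema)
    writer = ix.writer()
    with open(csv_file, encoding="utf-8") as handle:
        for row in csv.DictReader(handle):
            writer.add_document(id=row["id"], title=row["title"], content=row["body"])
    writer.commit()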
def get_schema(lang=languages[10]):
    """
    get_schema([lang="pt"])

    Returns the schema used to create the document index. By default the
    schema is loaded with the text analyzer for the Portuguese language, but
    it can be loaded for any of the languages supported by the Whoosh
    library. Whoosh currently supports the following languages:

    .. code-block:: python

        >>> from whoosh.lang import languages
        >>> languages
        ('ar', 'da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'no', 'pt',
         'ro', 'ru', 'es', 'sv', 'tr')

    The fields that make up the document index do not necessarily need to be
    all of the fields that make up the document. This schema documents 4
    fields - ``id``, ``pub_date``, ``title`` and ``body`` - of the 6 fields
    of the class :class:`apps.search.models.Article`, which defines a
    document in this project (the schema returned below additionally
    includes ``url``, ``source`` and ``links``). The document index schema
    for the Article class is the following:

    .. code-block:: python

        Schema(
            id = ID(unique=True, stored=True),
            pub_date = DATETIME(stored=True),
            title = TEXT(stored=True, analyzer=LanguageAnalyzer(lang)),
            body = TEXT(stored=True, analyzer=LanguageAnalyzer(lang)),
        )

    Since the ``title`` and ``body`` fields are of type ``TEXT``, they can
    receive text processing that varies with the language. The default
    language of this method is Portuguese. The ``lang`` parameter allows
    changing the language to one of those listed in
    :mod:`whoosh.lang.languages`. Choosing the language matters so that the
    lexical-syntactic analysis of the text is done correctly.

    The :class:`LanguageAnalyzer` text analyzer uses 3 filters for the text
    processing of the ``title`` and ``body`` fields: LowercaseFilter
    (converts to lowercase), StopFilter (removes irrelevant words) and
    StemFilter (reduces words to their stem).

    All schema fields are also stored in the document index. The
    ``stored=True`` parameter indicates that the fields will be indexed and
    stored. The ``unique`` parameter states that the field is unique.

    :param lang: Schema language.
    :type lang: str
    :returns: Schema
    """
    return Schema(
        id=ID(unique=True, stored=True),
        pub_date=DATETIME(stored=True, sortable=True),
        url=TEXT(stored=True),
        source=TEXT(stored=True),
        title=TEXT(stored=True, sortable=True, analyzer=LanguageAnalyzer(lang)),
        body=TEXT(stored=True, analyzer=LanguageAnalyzer(lang)),
        links=TEXT(stored=True),
    )
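# Usage sketch for get_schema(); the index directory and the article values are
# made up for illustration. The Portuguese LanguageAnalyzer is applied to title
# and body automatically at indexing time, so raw text is passed in.
import os
from datetime import datetime
from whoosh.index import create_in

os.makedirs("article_index", exist_ok=True)
ix = create_in("article_index", get_schema("pt"))
writer = ix.writer()
writer.add_document(
    id="1",
    pub_date=datetime(2020, 1, 15),
    url="https://example.com/artigo",
    source="example.com",
    title="Governo anuncia novas medidas",
    body="O governo anunciou hoje novas medidas para a economia.",
    links="",
)
writer.commit()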
def analyze(texto, lang=languages[10]):
    la = LanguageAnalyzer(lang)
    text_analyzed = ' '.join([token.text for token in la(texto)])
    return text_analyzed
def get_words(self, unit):
    """Return list of word pairs for a unit."""
    words = set()
    source_language = unit.translation.component.project.source_language

    # Filters stop words for a language
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()

    # Prepare analyzers
    # - simple analyzer just splits words based on regexp
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error)
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break

    if "" in words:
        words.remove("")

    if not words:
        # No extracted words, no dictionary
        return self.none()

    # Build the query for fetching the words
    # We want case insensitive lookup
    words = islice(words, 1000)
    if settings.DATABASES["default"]["ENGINE"] == "django.db.backends.postgresql":
        results = self.filter(
            source__search=reduce(
                lambda x, y: x | y, (SearchQuery(word) for word in words)
            ),
        )
    else:
        # MySQL
        results = self.filter(
            reduce(
                lambda x, y: x | y,
                (models.Q(source__search=word) for word in words),
            ),
        )

    return results.filter(
        project=unit.translation.component.project,
        language=unit.translation.language,
    )
def getSchema():
    return Schema(path=STORED,
                  id=STORED,
                  body=TEXT(analyzer=LanguageAnalyzer("en")))
"version": whoosh.fields.ID(stored=True), "url_endpoint": whoosh.fields.ID(stored=True), "url_args": whoosh.fields.ID(stored=True) } part_fields = { "category": whoosh.fields.ID(stored=True), "id": whoosh.fields.ID(field_boost=3.0, stored=True), "table_indices": whoosh.fields.TEXT(), "url_endpoint": whoosh.fields.ID(stored=True), "url_args": whoosh.fields.ID(stored=True) } for lang in languages: doc_fields["title_%s" % lang] = whoosh.fields.TEXT( stored=True, field_boost=2.0, analyzer=LanguageAnalyzer(lang)) doc_fields["content_%s" % lang] = whoosh.fields.TEXT(analyzer=LanguageAnalyzer(lang)) part_fields["title_%s" % lang] = whoosh.fields.TEXT( stored=True, field_boost=2.0, analyzer=LanguageAnalyzer(lang)) part_fields["content_%s" % lang] = whoosh.fields.TEXT(analyzer=LanguageAnalyzer(lang)) doc_schema = whoosh.fields.Schema(**doc_fields) part_schema = whoosh.fields.Schema(**part_fields) doc_index = whoosh.index.create_in(whoosh_dir, doc_schema, indexname="docs") part_index = whoosh.index.create_in(whoosh_dir, part_schema, indexname="parts") doc_parsers = {}