def createSearchableData(directory, load_path):
    '''
    Schema definition: question (analyzed and stored) and response (analyzed and stored).
    '''
    # The StemmingAnalyzer call was patched inside the whoosh package to support Portuguese.
    my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) | NgramFilter(
        minsize=2, maxsize=4)
    schema = Schema(question=TEXT(analyzer=my_analyzer, stored=True),
                    response=TEXT(analyzer=my_analyzer, stored=True))
    # schema = Schema(question=TEXT(stored=True), response=TEXT(stored=True))
    schema.cachesize = -1

    if not os.path.exists(directory):
        # makedirs creates the directory together with any missing parent directories
        os.makedirs(directory)

    # Create an index writer to add documents as per the schema
    ix = create_in(directory, schema)
    writer = ix.writer(limitmb=1024)

    with open(load_path) as subtles_file:
        subtles_corpus = subtles_file.read().splitlines()

    # The corpus alternates question/response lines, so step through it in pairs
    for i in range(0, len(subtles_corpus), 2):
        writer.add_document(question=subtles_corpus[i],
                            response=subtles_corpus[i + 1])
    writer.commit()
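# A minimal sketch (not part of the original code) of how the question/response
# index built above could be queried. The directory name "indexdir" and the
# query string are placeholders for whatever createSearchableData() was given.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir("indexdir")
with ix.searcher() as searcher:
    query = QueryParser("question", ix.schema).parse("como voce esta")
    for hit in searcher.search(query, limit=5):
        print(hit["question"], "->", hit["response"])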
def get_whoosh_index(force_create=False):
    from whoosh.index import create_in, exists_in, open_dir
    from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
    from whoosh.analysis import CharsetFilter, StemmingAnalyzer, NgramWordAnalyzer
    from whoosh.support.charset import accent_map

    analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
    ngramAnalyzer = NgramWordAnalyzer(minsize=2, maxsize=4)

    schema = Schema(
        title=TEXT(analyzer=analyzer, spelling=True, stored=True, field_boost=3.0),
        abstract=TEXT(analyzer=analyzer, stored=True, field_boost=2.0),
        path=ID(unique=True, stored=True),
        authors=TEXT(analyzer=analyzer, sortable=True, field_boost=1.5),
        content=TEXT(analyzer=analyzer, stored=True),
        tags=KEYWORD(sortable=True, commas=True, field_boost=1.5, lowercase=True),
        status=KEYWORD,
        classname=KEYWORD,
        typeahead=TEXT(spelling=True, stored=True, phrase=False)
    )

    if not os.path.exists(settings.WHOOSH_ROOT):
        os.mkdir(settings.WHOOSH_ROOT)

    if not exists_in(settings.WHOOSH_ROOT) or force_create:
        index = create_in(settings.WHOOSH_ROOT, schema)
    else:
        index = open_dir(settings.WHOOSH_ROOT)

    return index
def createSearchableData():
    charmap = charset_table_to_dict(default_charset)
    custom_analyzers = StemmingAnalyzer() | CharsetFilter(charmap)
    schema = Schema(title=TEXT(stored=True, field_boost=3.0),
                    ID=ID(stored=True, unique=True),
                    url=TEXT(stored=True),
                    textdata=TEXT(stored=True, analyzer=custom_analyzers, field_boost=0.8))

    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")

    ix = create_in("indexdir", schema)
    writer = ix.writer()

    path = os.path.relpath("/dump/dump_grande.xml", start="/")
    root = ET.parse(path)
    xml_data = {}
    for item in root.iter():
        if item.tag == 'root':
            continue  # skip the root element (the original's bare `next` was a no-op)
        elif item.tag == 'row' and len(xml_data) > 0:
            writer.add_document(title=xml_data['title'], ID=xml_data['id'],
                                url=xml_data['url'], textdata=xml_data['text'])
            xml_data = {}
        else:
            xml_data[item.tag] = item.text
    writer.commit()
def __init__(self, **kwargs):
    super(WhooshEngine, self).__init__()
    analyzer = (StemmingAnalyzer() | CharsetFilter(accent_map) |
                NgramFilter(minsize=4, maxsize=10))
    self.schema = Schema(id=ID(stored=True),
                         title=TEXT(stored=True, field_boost=5.0, analyzer=analyzer),
                         firstname=TEXT(stored=True, field_boost=2.0, analyzer=analyzer),
                         lastname=TEXT(stored=True, field_boost=2.0, analyzer=analyzer),
                         type=ID(stored=True),
                         description=TEXT(stored=True, analyzer=analyzer),
                         creators=TEXT(stored=False, analyzer=analyzer),
                         tags=TEXT(stored=False, analyzer=analyzer),
                         business_unit=TEXT(stored=False, analyzer=analyzer),
                         position=TEXT(stored=False, analyzer=analyzer),
                         competencies=TEXT(stored=False, analyzer=analyzer),
                         text=TEXT(stored=True, analyzer=analyzer))
    self.dir = kwargs['dir']
    if not os.path.exists(self.dir):
        os.makedirs(self.dir)
    try:
        self._index = open_dir(self.dir)
    except EmptyIndexError:
        self._index = create_in(self.dir, self.schema)
def build_schema(self, fields):
    schema = super(FoldingWhooshSearchBackend, self).build_schema(fields)

    for name, field in schema[1].items():
        if isinstance(field, (TEXT, NGRAM, NGRAMWORDS)):
            field.analyzer = (StemmingAnalyzer() | CharsetFilter(accent_map) |
                              NgramFilter(minsize=2, maxsize=15))

    return schema
def analyze(text):
    my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
    tokens = my_analyzer(text.strip())
    words = [token.text for token in tokens]
    return words
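# Purely illustrative (not from the original code): the analyzer chain above
# lowercases, drops English stopwords, stems, and then folds accents, so a call
# like the following collapses related forms onto the same tokens. The exact
# stems depend on the Whoosh version and its default stemmer.
print(analyze("Running quickly through the CAFÉS"))
# Lowercased, stopword-free, stemmed, accent-folded tokens --
# roughly ['run', 'quickli', 'cafe'] with the default English stemmer.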
def normalize_name(name):
    stem = True
    letters = list(name)
    # if the format is w o r k o u t / w.o.r.k.o.u.t / w*o*r*k*o*u*t, join it together
    if len(letters) > 4:
        if len(set([letters[i] for i in range(0, len(letters), 2)])) == 1:
            name = "".join([letters[i] for i in range(1, len(letters), 2)])
        elif len(set([letters[i] for i in range(1, len(letters), 2)])) == 1:
            name = "".join([letters[i] for i in range(0, len(letters), 2)])
    # if there is an & not surrounded by spaces, leave it alone (example: 'r&b')
    if "&" in letters:
        position = letters.index("&")
        if position > 0 and position < len(letters) - 1:
            if letters[position - 1] != ' ' and letters[position + 1] != ' ':
                stem = False
    # if a k follows a 2 (e.g. '2k17'), turn the k into a 0
    if "k" in letters and '2' in letters:
        positions = [x for x in range(len(letters)) if letters[x] == 'k']
        for pos in positions:
            if pos > 0 and pos < len(letters) - 1:
                if letters[pos - 1] == '2':
                    letters[pos] = '0'
        name = "".join(letters)
    # proceed to stem
    if stem:
        my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
        tokens = my_analyzer(name)
        words = [token.text for token in tokens]
        # if the result is empty, leave the name alone; otherwise rebuild it from the tokens
        if len(words) != 0:
            result = ""
            for el in words:
                result += el + " "
            letters = list(result)[:-1]
    # softer stem
    else:
        name = name.lower()
        name = re.sub(r"[.,'\/#!$%\^\*;:{}=\_`~()@]", ' ', name)
        name = re.sub(r'\s+', ' ', name).strip()
        letters = list(name)
    # if the last n characters are equal, keep only one
    last = letters[-1]
    if last in ascii_letters and len(letters) > 1:
        while letters[-2] == last:
            letters.pop(-2)
            if len(letters) == 1:
                break
    return ''.join(letters)
def _get_schema():
    # WARN: stemming is English-specific; character folding is for Western languages
    analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
    schema = Schema(
        code=ID(unique=True, stored=True),
        slug=ID(unique=False, stored=True),
        title=TEXT(analyzer=analyzer, stored=True),
        content=TEXT(analyzer=analyzer),
    )
    return schema
def main(args):
    my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
    for filename in glob.glob(args.dir + "/*.txt"):
        with open(filename, 'r') as readfile:
            infile = readfile.readlines()
            label = ("__label__" + os.path.splitext(os.path.basename(readfile.name))[0] + " ") if args.add_label else ''
            for line in infile:
                tokens = my_analyzer(line.strip())
                words = [token.text for token in tokens]
                # print(line.strip())
                print(label + ' '.join(words))
def __init__(self):
    chfilter = CharsetFilter(accent_map)
    stoplist = stoplists["en"].union(stoplists["fr"])
    analyzer = RegexTokenizer() | LowercaseFilter() | \
        StopFilter(stoplist=stoplist) | chfilter

    # defines the schema
    # see http://pythonhosted.org/Whoosh/schema.html for reference
    keywordType = KEYWORD(lowercase=True, scorable=True)
    self.schema = Schema(content=TEXT(analyzer=analyzer),
                         docType=TEXT,
                         docId=ID(stored=True, unique=True),
                         tags=keywordType)

    # Adds dynamic fields so each document can index its fields in the
    # same Whoosh index
    self.schema.add('*_string', TEXT(analyzer=analyzer), glob=True)
    self.schema.add('*_date', DATETIME, glob=True)
    self.schema.add('*_number', NUMERIC, glob=True)
    self.schema.add('*_boolean', BOOLEAN, glob=True)

    # Creates the index folder and Whoosh index files if they don't exist,
    # and loads the index in any case
    if not os.path.exists("indexes"):
        os.mkdir("indexes")
        self.index = index.create_in("indexes", self.schema)
    else:
        self.index = index.open_dir("indexes")

    # Creates the doctypes folder if it doesn't exist
    if not os.path.exists("doctypes"):
        os.mkdir("doctypes")

    # Creates the default doctypes schema file if it doesn't exist
    if not os.path.exists('doctypes/doctypes_schema.json'):
        with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
            defaultFile.write("{}")

    '''
    Loads the doctypes schema if it's valid, otherwise recreates it.
    The doctypes schema is a dictionary of doctypes with their fields,
    created and updated when a document is indexed. That way, we can tell
    Whoosh which fields to search by default, because there is apparently
    no way to say "search in all fields".
    '''
    with open('doctypes/doctypes_schema.json', 'r+') as rawJSON:
        try:
            self.doctypesSchema = json.load(rawJSON)
        except ValueError:
            rawJSON.write("{}")
            self.doctypesSchema = {}
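# A hedged sketch (not part of the original class) of how the doctypes schema
# could drive a search. The search() method, its signature, and the assumption
# that doctypesSchema maps a doctype to a list of field names are illustrative.
from whoosh.qparser import MultifieldParser

def search(self, doctype, querystring, limit=10):
    # Search the doctype's recorded fields plus the generic ones;
    # fall back to content/tags if the doctype has no fields recorded yet.
    fields = list(self.doctypesSchema.get(doctype, [])) + ["content", "tags"]
    parser = MultifieldParser(fields, schema=self.index.schema)
    with self.index.searcher() as searcher:
        return [hit["docId"]
                for hit in searcher.search(parser.parse(querystring), limit=limit)]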
def processText(text):
    """
    Pre-processes the text before inserting it into the index. Specifically,
    it removes punctuation characters and discards words that are only one
    letter long. It also removes stopwords, stems the words, and normalizes
    accented letters (and other characters) to ASCII text.
    :rtype: list
    """
    # (, filterStopwords=False, stemming=False, normalizeAccents=False, minLength=1)
    # tokenization
    # tokens = nltk.wordpunct_tokenize(text)
    # tokenizer = RegexTokenizer()
    # if stemming:
    #     if filterStopwords:
    #         analyzer = StemmingAnalyzer()
    #     else:
    #         analyzer = StemmingAnalyzer(stoplist=None)
    # else:
    #     if filterStopwords:
    #         analyzer = StandardAnalyzer()
    #     else:
    #         analyzer = StandardAnalyzer(stoplist=None)
    # if normalizeAccents:
    analyzer = StemmingAnalyzer() | CharsetFilter(charmap)  # accent_map

    # Remove stopwords and punctuation
    processedText = []
    for token in analyzer(text):
        tokenText = token.text.translate(
            str.maketrans('', '', string.punctuation))
        if len(tokenText) > 1:
            processedText.append(tokenText)

    return processedText
def create_table(index_dir, *, overwrite=False):
    analyzer = StandardAnalyzer() | CharsetFilter(accent_map)
    schema = Schema(label=TEXT(stored=True, analyzer=analyzer, lang='fr'),
                    rome=TEXT(stored=True, sortable=True),
                    source=KEYWORD(stored=True, sortable=True),
                    slug=STORED)

    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    elif exists_in(index_dir):
        if not overwrite:
            logger.critical(
                'An index already exists in %s; overwrite flag not set; abandoning',
                index_dir)
            raise RuntimeError('Index already exists')
        logger.warning('Index already found, deleting %s to start anew', index_dir)
        shutil.rmtree(index_dir, ignore_errors=True, onerror=None)
        os.mkdir(index_dir)

    logger.info('Whoosh index %s ready for use', index_dir)
    create_in(index_dir, schema)
    return index_dir
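# A minimal, hypothetical loader for the table created above (not from the
# original module); the row keys are assumptions about the caller's data.
from whoosh.index import open_dir

def populate_table(index_dir, rows):
    # rows is assumed to be an iterable of dicts with label/rome/source/slug keys.
    ix = open_dir(index_dir)
    writer = ix.writer()
    for row in rows:
        writer.add_document(label=row['label'], rome=row['rome'],
                            source=row['source'], slug=row['slug'])
    writer.commit()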
def get_whoosh_field_type(cls, field, sortable=False):
    '''
    Defines the Whoosh field types used to define the schemas.
    See get_field_infos().
    '''
    # see http://pythonhosted.org/Whoosh/api/analysis.html#analyzers
    # see JIRA 165
    from whoosh.fields import TEXT, ID, NUMERIC, BOOLEAN
    # TODO: shall we use stop words? e.g. 'A and B' won't work?
    from whoosh.analysis import SimpleAnalyzer, StandardAnalyzer, StemmingAnalyzer, CharsetFilter, RegexTokenizer
    from whoosh.support.charset import accent_map
    # ID: as is; SimpleAnalyzer: breaks into lowercase terms, ignores punctuation;
    # StandardAnalyzer: + stop words + minsize=2; StemmingAnalyzer: + stemming
    # minsize=1 because we want to search for 'Scribe 2'
    # A paragraph or more.

    field_type = field['type']

    if field_type == 'id':
        # An ID (e.g. 708-AB)
        # EXACT search only
        analyzer = None
        if field.get('multivalued', False):
            analyzer = RegexTokenizer(r'\|', gaps=True)
        ret = ID(stored=True, sortable=sortable, analyzer=analyzer)
    elif field_type in ['int']:
        ret = NUMERIC(sortable=sortable)
    elif field_type in ['code']:
        # A code (e.g. K. 402, Royal 7.C.xii)
        # Accepts partial but exact search (e.g. royal)
        # See JIRA 358
        # | is NECESSARY for multivalued fields
        ret = TEXT(analyzer=SimpleAnalyzer(r'[/.\s()\u2013\u2014|-]', True),
                   stored=True, sortable=sortable)
    elif field_type == 'title':
        # A title (e.g. British Library)
        # Accepts variants and partial search (e.g. 'libraries')
        ret = TEXT(analyzer=StemmingAnalyzer(minsize=1, stoplist=None) | CharsetFilter(accent_map),
                   stored=True, sortable=sortable)
    elif field_type == 'short_text':
        # A few words.
        ret = TEXT(analyzer=StemmingAnalyzer(minsize=2) | CharsetFilter(accent_map),
                   stored=True, sortable=sortable)
    elif field_type == 'xml':
        # plain text derived from an XML document
        ret = TEXT(analyzer=StemmingAnalyzer(minsize=2) | CharsetFilter(accent_map),
                   stored=True, sortable=sortable)
    elif field_type == 'boolean':
        # 0|1
        ret = NUMERIC(stored=True, sortable=sortable)
    else:
        ret = TEXT(analyzer=StemmingAnalyzer(minsize=2) | CharsetFilter(accent_map),
                   stored=True, sortable=sortable)

    return ret
from whoosh.analysis import CharsetFilter, LowercaseFilter, NgramFilter, \
    PathTokenizer, RegexTokenizer
from whoosh.fields import DATETIME, ID, KEYWORD, NUMERIC, TEXT, FieldType, \
    SchemaClass
from whoosh.formats import Existence
from whoosh.support.charset import accent_map

from abilian.core.models.subjects import Group, User
from abilian.core.util import noproxy
from abilian.services.security.models import Anonymous, Role

#: A Whoosh analyzer that splits on word boundaries and folds accents and case.
accent_folder = (
    RegexTokenizer(r'\w+') |  # the default tokenizer doesn't split on '.'
    LowercaseFilter() |
    CharsetFilter(accent_map))

#: Analyzer for edge-ngrams, from 2 to 6 characters long
edge_ngram = accent_folder | NgramFilter(minsize=2, maxsize=6, at='start')


def EdgeNgramField():
    return TEXT(stored=False, analyzer=edge_ngram)


class _DefaultSearchSchema(SchemaClass):
    """General search schema."""
    object_key = ID(stored=True, unique=True)
    id = NUMERIC(numtype=int, bits=64, signed=False, stored=True, unique=False)
    object_type = ID(stored=True, unique=False)
    creator = ID(stored=True)
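# Illustrative only (not from the original module): calling the edge_ngram
# analyzer above on a string shows the prefix tokens that make typeahead-style
# prefix matching work. The sample word is made up.
print([t.text for t in edge_ngram("Événement")])
# Accent-folded, lowercased prefixes of length 2..6,
# i.e. roughly ['ev', 'eve', 'even', 'evene', 'evenem']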
from whoosh.fields import Schema, STORED, TEXT, ID
import os.path
from whoosh.index import create_in
from whoosh.analysis import CharsetFilter, StemmingAnalyzer
from whoosh import fields
from whoosh.support.charset import accent_map

# For example, to add an accent-folding filter to a stemming analyzer:
my_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)

schema = Schema(title=TEXT(analyzer=my_analyzer, spelling=True),
                titleStemmed=TEXT(analyzer=my_analyzer),
                content=TEXT(analyzer=my_analyzer, spelling=True),
                contentStemmed=TEXT(analyzer=my_analyzer),
                nid=ID(stored=True))

if not os.path.exists("index/index"):
    os.mkdir("index/index")
ix = create_in("index/index", schema)
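# A hedged sketch of one thing spelling=True enables once documents have been
# added: asking the searcher for a corrected version of a misspelled query
# ("did you mean"). The query string below is a deliberately misspelled placeholder.
from whoosh.qparser import QueryParser

with ix.searcher() as searcher:
    qp = QueryParser("content", ix.schema)
    qstring = "recieve"
    query = qp.parse(qstring)
    corrected = searcher.correct_query(query, qstring)
    if corrected.query != query:
        print("Did you mean:", corrected.string)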
def create_index(index_dir):
    schema = Schema(book_abbr=STORED(),
                    book_name=STORED(),
                    book_tree=STORED(),
                    book_kindle=STORED(),

                    short=STORED(),
                    long=STORED(),
                    key_terms=STORED(),
                    key_terms_content=TEXT(stored=True, analyzer=CleanupStandardAnalyzer(analyzer_re, STOP_WORDS) | CharsetFilter(accent_map)),

                    book=ID(stored=True),
                    heading=TEXT(stored=True, analyzer=StemmingAnalyzer(minsize=1, stoplist=None) | CharsetFilter(accent_map)),
                    session=TEXT(stored=True, analyzer=StandardAnalyzer(minsize=1, stoplist=None)),
                    date=DATETIME(stored=True, sortable=True),

                    exact=TEXT(stored=True, analyzer=CleanupStandardAnalyzer(analyzer_re, stoplist=None) | CharsetFilter(accent_map)),
                    stemmed=TEXT(stored=True, analyzer=CleanupStemmingAnalyzer(analyzer_re) | CharsetFilter(accent_map)),
                    common=TEXT(stored=True, analyzer=CleanupStemmingAnalyzer(analyzer_re, stoplist=None) | CharsetFilter(accent_map)),
                    )
    ix = index.create_in(index_dir, schema)

    writer = ix.writer()
    for book in Books.indexed:
        with open("books/{}.txt".format(book['abbr']), encoding='utf-8') as f:
            text = pre_process_book(book, f.read())
        text = re.search(book['book_re'], text, flags=re.DOTALL).group(1)

        d = {
            'book_name': book['name'],
            'book_abbr': book['abbr'],
            'book_tree': book['tree'],
            'book_kindle': book['kindle'],
            'book': book['abbr'].lower(),
        }

        i = 0
        heading_tiers = [{'short': '', 'long': ''}] * 3
        carry_over_heading = None
        headings = list(filter(None, book['headings_re'].split(text)[1:]))
        for (__heading, _content) in zip(headings[::2], headings[1::2]):
            content = __heading + _content
            if carry_over_heading:
                content = carry_over_heading + content
                carry_over_heading = None

            heading = clean_heading(__heading)
            if 'heading_replacements' in book:
                for (pattern, repl) in book['heading_replacements']:
                    heading = pattern.sub(repl, heading, 1)

            update_heading_tiers(book, heading_tiers, heading)

            has_content = re.search(r'[a-z]', _content)
            if not has_content:
                carry_over_heading = content
                continue

            add_document(writer, d, heading_tiers, content)
            i += 1
        print(i)
    writer.commit()
    return ix
if tier_re['begin'] and re.search(tier_re['begin'], short_heading, flags=re.IGNORECASE):
    short, long = heading.split('\n') if '\n' in heading else (heading, '')
    tiers[tier_idx] = {'short': title(short), 'long': title(long)}
if tier_re['end'] and re.search(tier_re['end'], short_heading, flags=re.IGNORECASE):
    tiers[tier_idx] = {'short': '', 'long': ''}

# letters allowed, optionally interspersed with periods or asterisks; can't end with a number
# if it's only numbers then it's fine to end with a number
# a term can't be adjacent to mid-line double asterisks
# (remember that our pre-processing already fixed *hello ho**w are you* to *hello how are you*, so legitimate ones are safe)
analyzer_re = re.compile(r'(?<![^\n]\*\*)\b(\w+([.*]?\w+)*(?<![0-9])|[0-9]+([.*]?[0-9]+)*)\b(?!\*\*[^\n])', re.UNICODE)

search_schema = Schema(book=ID(),
                       heading=TEXT(analyzer=StemmingAnalyzer(minsize=1, stoplist=None) | CharsetFilter(accent_map)),
                       session=TEXT(analyzer=StandardAnalyzer(minsize=1, stoplist=None)),
                       date=DATETIME(),

                       exact=TEXT(analyzer=StandardAnalyzer(stoplist=None) | CharsetFilter(accent_map)),
                       stemmed=TEXT(analyzer=StemmingAnalyzer() | CharsetFilter(accent_map)),
                       common=TEXT(analyzer=StemmingAnalyzer(stoplist=None) | CharsetFilter(accent_map)),
                       )
from whoosh.analysis import StandardAnalyzer, CharsetFilter
from whoosh.fields import *
from whoosh.support.charset import accent_map

INDEX_DIR = 'indexdir'
PAGE_IDX_NAME = 'page_idx'
MAIN_LANGS = ['cs', 'fi', 'sk']

# Tokenize and lowercase the input and remove accents
analyzer = StandardAnalyzer() | CharsetFilter(accent_map)

# Whoosh index schema - store ID and original page title, analyze title using above analyzer
page_schema = Schema(id=NUMERIC(stored=True),
                     title=TEXT(stored=True, analyzer=analyzer, spelling=True))

one_index_schema = Schema(original_title=TEXT(stored=True, analyzer=analyzer, spelling=True),
                          source_lang=KEYWORD,
                          target_lang=KEYWORD,
                          translated=TEXT(stored=True))


def measure_execution_time(enabled):
    def execution_time(func):
        def wrapper(*args, **kwargs):
            start = None
            if enabled:
                start = datetime.datetime.now()
            result = func(*args, **kwargs)
            if start:
                print(f'{func.__name__}() took {datetime.datetime.now() - start}')
            return result

        wrapper.__doc__ = func.__doc__
def indexer():
    charmap = charset_table_to_dict(default_charset)
    my_analyzer = StemmingAnalyzer() | CharsetFilter(charmap) | StopFilter()
    schema = Schema(url=ID(stored=True),
                    title=TEXT(stored=True),
                    content=TEXT(stored=True, analyzer=my_analyzer, spelling=True),
                    data=STORED,
                    tags=KEYWORD(stored=True),
                    extension=TEXT(stored=True))

    if not os.path.exists("everywhere"):
        os.mkdir("everywhere")
    if not os.path.exists("pdf"):
        os.mkdir("pdf")
    if not os.path.exists("doc"):
        os.mkdir("doc")
    if not os.path.exists("tar"):
        os.mkdir("tar")
    if not os.path.exists("jpg"):
        os.mkdir("jpg")
    if not os.path.exists("forms"):
        os.mkdir("forms")

    i_a = index.create_in("everywhere", schema)
    writer_a = i_a.writer()
    i_b = index.create_in("pdf", schema)
    writer_b = i_b.writer()
    i_c = index.create_in("doc", schema)
    writer_c = i_c.writer()
    i_d = index.create_in("tar", schema)
    writer_d = i_d.writer()
    i_e = index.create_in("jpg", schema)
    writer_e = i_e.writer()
    i_f = index.create_in("forms", schema)
    writer_f = i_f.writer()

    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    specific = [
        '.jpg', '.exe', '.pdf', '.doc', '.zip', '.xls', 'pptx', 'docx',
        'r.gz', '.iso', 'jpeg', '.gif', '.png'
    ]
    ignore = ['calendar', 'events', 'mailto']

    with open('intranet/crawled.txt', "r") as fp:
        num = 0
        for line in fp:
            num = num + 1
            print("Extracting link" + str(num))
            line = line.replace('\n', '')
            # if line[-4:] not in specific:
            if all(item not in line.lower() for item in ignore):
                try:
                    if all(item not in line.lower() for item in specific):
                        print(line)
                        html = urlopen(line)
                        soup = BeautifulSoup(html, "html.parser")
                        for script in soup(["script", "style"]):
                            script.extract()
                        try:
                            heading = soup.title.string
                        except AttributeError:
                            heading = "line"
                        # print(str(heading))
                        try:
                            content = soup.body.get_text()
                        except AttributeError:
                            content = ""
                        tags = ""
                        try:
                            for h in soup.findAll(
                                    ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7']):
                                tags = tags + " " + h.string
                        except:
                            pass
                    else:
                        # pattern = re.compile('[\W_]+')
                        heading = line
                        # heading = pattern.sub(' ', heading)
                        # re.sub(r'[\W_]+', '', heading)
                        # heading = heading.split()
                        content = line.split()
                        tags = ""

                    title = str(heading)
                    # print(title)
                    tags = str(tags)
                    content = str(content)
                    # print("content")
                    url = str(line)
                    extension = str(line[-4:])

                    writer_a.add_document(url=url, title=title, data=content,
                                          content=content, tags=tags,
                                          extension=extension)
                    if "pdf" in line.lower():
                        writer_b.add_document(url=url, title=title, data=content,
                                              content=content, tags=tags,
                                              extension=extension)
                        print("added to pdf")
                    elif (".doc" in line.lower()) or (".ppt" in line.lower()) or (
                            ".xls" in line.lower()) or (
                            "docx" in line.lower()) or (".ppt" in line.lower()):
                        writer_c.add_document(url=url, title=title, data=content,
                                              content=content, tags=tags,
                                              extension=extension)
                        print("added to doc")
                    elif (".exe" in line.lower()) or (".iso" in line.lower()) or (
                            ".zip" in line.lower()) or ("r.gz" in line.lower()):
                        writer_d.add_document(url=url, title=title, data=content,
                                              content=content, tags=tags,
                                              extension=extension)
                        print("added to tar")
                    elif (".jpeg" in line.lower()) or (".jpg" in line.lower()) or (
                            ".gif" in line.lower()) or (".png" in line.lower()):
                        writer_e.add_document(url=url, title=title, data=content,
                                              content=content, tags=tags,
                                              extension=extension)
                        print("added to jpg")
                    elif "form" in line.lower():
                        writer_f.add_document(url=url, title=title, data=content,
                                              content=content, tags=tags,
                                              extension=extension)
                        print("added to form")
                    else:
                        print("adding to everywhere")
                        # writer_a.add_document(url=url, title=title, data=content, content=content, tags=tags)
                    print("added To whoosh")
                except urllib.error.HTTPError:
                    print("HTTP Error")
                    # test = "True"
                except (ConnectionResetError, urllib.error.URLError):
                    print("Connection Reset Fail")
            else:
                print("ignored this url")

    writer_a.commit()
    writer_b.commit()
    writer_c.commit()
    writer_d.commit()
    writer_e.commit()
    writer_f.commit()
def generateCombos(wordList):
    comboSet = set()
    if not wordList:  # base case: without it the recursion below never terminates
        return comboSet
    formedSoFar = None
    for word in wordList:
        if formedSoFar:
            formedSoFar = (formedSoFar[0] + word[0], formedSoFar[1] and word[1])
        else:
            formedSoFar = word
        comboSet.add(formedSoFar)
    comboSet.update(generateCombos(wordList[1:]))
    return comboSet


textAnalyzer = RegexTokenizer() | CharsetFilter(accent_map)
keywordAnalyzer = SpaceSeparatedTokenizer() | CharsetFilter(accent_map)

schema = Schema(
    ayah=STORED,
    simple_ayah=TEXT(stored=True, analyzer=textAnalyzer),
    surah_num=NUMERIC(stored=True),
    ayah_num=NUMERIC(stored=True),
    roots=KEYWORD(scorable=True, analyzer=keywordAnalyzer),
    decomposed_ayah=KEYWORD(scorable=True, analyzer=keywordAnalyzer),
    surah_name_ar=STORED,
    surah_name_en=STORED,
)

if not os.path.exists("whooshdir"):
    os.mkdir("whooshdir")
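# Illustrative only (the sample input is made up): generateCombos(), defined at
# the top of the previous snippet, collects every contiguous run of the input
# pairs, concatenating the strings and AND-ing the flags.
combos = generateCombos([("ab", True), ("c", False), ("d", True)])
print(sorted(combos))
# -> [('ab', True), ('abc', False), ('abcd', False),
#     ('c', False), ('cd', False), ('d', True)]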