def CreateQueryParser():
    global qp
    # OrGroup.factory(0.9) OR-s the terms but scales scores so documents
    # matching more terms still rank higher.
    og = qparser.OrGroup.factory(0.9)
    qp = qparser.MultifieldParser(["Title", "ParaText"], schema=ix.schema, group=og)
    qp.add_plugin(FuzzyTermPlugin())
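For context, a minimal self-contained sketch of how a parser built this way might be exercised; the schema, in-memory index, and sample text below are assumptions for illustration, not part of the original code:

from whoosh import qparser
from whoosh.fields import Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import FuzzyTermPlugin

# Hypothetical schema matching the field names used above.
schema = Schema(Title=TEXT(stored=True), ParaText=TEXT)
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(Title=u"Whoosh basics", ParaText=u"Indexing and searching text")

og = qparser.OrGroup.factory(0.9)
qp = qparser.MultifieldParser(["Title", "ParaText"], schema=ix.schema, group=og)
qp.add_plugin(FuzzyTermPlugin())

# "whosh~2" tolerates up to two edits, so it still matches "whoosh".
with ix.searcher() as s:
    print([hit["Title"] for hit in s.search(qp.parse(u"whosh~2"))])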
def searcher(index_path, query):
    ix = open_dir(index_path)
    parser = QueryParser("content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())
    my_query = parser.parse(query)
    # Use a context manager so the searcher is always released.
    with ix.searcher() as s:
        results = s.search(my_query, limit=None)
        for result in results:
            print(result['content'])
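A hypothetical invocation of the helper above; the index path and query string are examples only:

# Assumes an existing index directory with a "content" field;
# "pithon~2" matches "python" within two edits.
searcher("indexdir", u"pithon~2")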
def __init__(self, articles_path):
    """Attempt to initialize a folder with Markdown articles.

    If a git repo, create a search index and populate.

    Markdown Extension References
    * http://facelessuser.github.io/pymdown-extensions
    * https://pythonhosted.org/Markdown/extensions
    """
    self.article_repo = Repo(articles_path)
    self.articles_path = articles_path
    self.markdown_extensions = [
        'markdown.extensions.abbr',
        'markdown.extensions.attr_list',
        'markdown.extensions.def_list',
        'markdown.extensions.fenced_code',
        'markdown.extensions.footnotes',
        'markdown.extensions.tables',
        'markdown.extensions.smart_strong',
        'markdown.extensions.admonition',
        'markdown.extensions.codehilite',
        'markdown.extensions.headerid',
        'markdown.extensions.sane_lists',
        'markdown.extensions.smarty',
        'markdown.extensions.toc',
        'markdown.extensions.wikilinks',
        'pymdownx.betterem',
        'pymdownx.caret',
        'pymdownx.githubemoji',
        'pymdownx.headeranchor',
        'pymdownx.magiclink',
        'pymdownx.mark',
        'pymdownx.smartsymbols',
        'pymdownx.tasklist',
        'pymdownx.tilde',
        'pymdownx.critic',
    ]
    self.markdown_extensions_config = {
        'markdown.extensions.codehilite': {
            'css_class': 'code-highlight'
        }
    }
    self.__search_schema = Schema(
        title=ID(stored=True, unique=True),
        path=ID(stored=True),
        content=TEXT,
    )
    self.__search_parser = MultifieldParser(
        ['title', 'content'],
        schema=self.__search_schema,
    )
    self.__search_parser.add_plugin(FuzzyTermPlugin())
    self.__search_index = self.create_search_index()
    self.populate_search_index()
def __init__(self, url, headers, rows):
    self.url = url
    self.headers = headers
    self.rows = rows
    self.schema = Schema(
        name=TEXT(stored=False),
        alternative_names=TEXT(stored=False),
        id=ID(stored=True)
    )
    self.index = RamStorage().create_index(self.schema)

    for c in [NAME_HEADER, ALT_NAMES_HEADER, TYPE_HEADER]:
        assert c in self.headers, 'Required "{}" column not found in {}'.format(c, url)
    name_idx = self.headers.index(NAME_HEADER)
    alt_names_idx = self.headers.index(ALT_NAMES_HEADER)

    writer = self.index.writer()
    for idx, row in enumerate(self.rows):
        name = row[name_idx]
        alt_names = row[alt_names_idx]
        writer.add_document(
            name=str(name),
            alternative_names=str(alt_names),
            id=str(idx)
        )
    writer.commit()

    parser = QueryParser("name", self.index.schema)
    self.exact_name_query_parser = parser

    parser = QueryParser("name", self.index.schema)
    parser.add_plugin(FuzzyTermPlugin())
    self.name_query_parser = parser

    parser = QueryParser("alternative_names", self.index.schema)
    parser.add_plugin(FuzzyTermPlugin())
    self.alt_names_query_parser = parser
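A hedged sketch of how the three parsers built in this constructor might be used from another method of the same class; the method context and query strings are assumptions for illustration:

# Inside a hypothetical method of the same class:
with self.index.searcher() as s:
    exact = self.exact_name_query_parser.parse(u"london")    # exact term only
    fuzzy = self.name_query_parser.parse(u"londn~2")         # up to 2 edits
    alt = self.alt_names_query_parser.parse(u"londres~1")    # up to 1 edit
    for q in (exact, fuzzy, alt):
        print([hit["id"] for hit in s.search(q, limit=5)])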
def search():
    print(request.args)
    search = request.args.get('search')
    author = request.args.get('author')
    category = request.args.get('category')
    page = int(request.args.get('page')) if request.args.get('page') is not None else 1
    print(search)

    if search is None and author is None and category is None:
        myquery = Every()
    else:
        if search is None:
            if author is not None:
                myquery = Term('author', author)
                if category is not None:
                    myquery = myquery & Term('category', category)
            else:
                myquery = Term('category', category)
        else:
            myquery = MultifieldParser(["title", "post_content"], ix.schema,
                                       plugins=[FuzzyTermPlugin()]).parse(search)
            if author is not None:
                myquery = myquery & Term('author', author)
            if category is not None:
                myquery = myquery & Term('category', category)

    with ix.searcher() as searcher:
        results = searcher.search_page(myquery, page, pagelen=25,
                                       sortedby="date", reverse=True)
        print(results.is_last_page())
        results_json = json.dumps(
            {
                "results": [dict(i) for i in results],
                "page": page,
                "total_results": results.total
            },
            default=str)

    resp = Response(response=results_json, status=200, mimetype="application/json")
    return resp
def inicia():
    pth = os.path.abspath(os.path.dirname(os.path.realpath(__file__)) + "/indiceJuego")
    if not os.path.exists(pth):
        os.mkdir(pth)
        esquemaJuego = Schema(titulo=KEYWORD(stored=True),
                              descripcion=TEXT,
                              categorias=KEYWORD(stored=True),
                              plataformas=KEYWORD(stored=True),
                              precio=NUMERIC(stored=True))
        # Create the index under the absolute path computed above.
        indiceJuego = create_in(pth, esquemaJuego)
    else:
        indiceJuego = open_dir(pth)
    parser = MultifieldParser(["titulo"], schema=indiceJuego.schema)
    parser.add_plugin(FuzzyTermPlugin())
    return indiceJuego, parser
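A possible call site for inicia(); the query string is an example only:

indiceJuego, parser = inicia()
# "zelda~1" matches titles within one edit of "zelda".
consulta = parser.parse(u"zelda~1")
with indiceJuego.searcher() as buscador:
    for hit in buscador.search(consulta):
        print(hit['titulo'], hit['precio'])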
def search_in_index(search_kw, index):
    parser = MultifieldParser(["content", "title"], index.schema)
    parser.add_plugin(FuzzyTermPlugin())
    searcher = index.searcher()
    # Append an explicit edit-distance suffix to every word (~0 allows no edits).
    to_parse = ' '.join([i + '~0' for i in search_kw.split(' ')])
    myquery = parser.parse(to_parse)
    r = searcher.search(myquery)
    results = []
    for res in r:
        results.append(res['path'])
    # Collect spelling suggestions for each word from the "content" field.
    corrector = searcher.corrector("content")
    suggestions = []
    for kw in search_kw.split(' '):
        suggestions.append(corrector.suggest(kw))
    searcher.close()
    return results, suggestions
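For reference, FuzzyTermPlugin accepts three suffix forms: term~ (one edit), term~n (n edits), and term~n/p (n edits, with the first p characters required to match exactly). A minimal sketch with an assumed single-field schema:

from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser, FuzzyTermPlugin

parser = QueryParser("content", Schema(content=TEXT))
parser.add_plugin(FuzzyTermPlugin())
print(parser.parse(u"whoosh~"))      # maxdist 1
print(parser.parse(u"whoosh~2"))     # maxdist 2
print(parser.parse(u"whoosh~2/3"))   # maxdist 2, 3-character prefix must match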
def match(query_str, idx, limit=40):
    ret_results = []
    query_words = words_get(query_str)
    if len(query_words) == 0:
        return ret_results

    with idx.searcher() as searcher:
        rome_facet = sorting.FieldFacet('rome')

        # Strict search, with forced correction
        parser = QueryParser('label', idx.schema)
        query = parser.parse(f'{query_str}')
        cor = searcher.correct_query(query, query_str)
        results = searcher.search(cor.query, limit=20, collapse=rome_facet)

        # Word-joker search
        parser = QueryParser('label', idx.schema)
        query = parser.parse(f'{query_str}*')
        results_partial = searcher.search(query, limit=20, collapse=rome_facet)
        results.upgrade_and_extend(results_partial)

        # Fuzzy search; drop words of three letters or fewer first.
        parser = QueryParser('label', idx.schema, termclass=CustomFuzzyTerm)
        parser.add_plugin(FuzzyTermPlugin())
        shortword = re.compile(r'\W*\b\w{1,3}\b')
        query_prep = shortword.sub('', query_str)
        query = parser.parse(query_prep)
        results_fuzzy = searcher.search(query, limit=limit, collapse=rome_facet)
        results.upgrade_and_extend(results_fuzzy)

        for res in results:
            ret_results.append({
                'id': res['rome'],
                'label': res['label'],
                'value': res['label'],
                'occupation': res['slug'],
                'source': res['source'],
                'score': res.score
            })

    return sorted(ret_results, key=lambda e: e['score'], reverse=True)
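CustomFuzzyTerm is defined elsewhere in that codebase; a plausible shape, shown here purely as an assumption, is a FuzzyTerm subclass that widens the default edit distance applied to bare terms parsed through the termclass:

from whoosh.query import FuzzyTerm

class CustomFuzzyTerm(FuzzyTerm):
    # Assumed definition: same signature as FuzzyTerm, with maxdist
    # defaulting to 2 instead of 1.
    def __init__(self, fieldname, text, boost=1.0, maxdist=2,
                 prefixlength=1, constantscore=True):
        super().__init__(fieldname, text, boost, maxdist,
                         prefixlength, constantscore)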
def get_schema(self):
    schema_dir = os.path.join(cache_dir, self.id_)
    os.makedirs(schema_dir, exist_ok=True)
    if exists_in(schema_dir) and open_dir(schema_dir).doc_count() != 0:
        self.ix = open_dir(schema_dir)
        print('Existing index cache found. Loaded {} tree nodes. Hooray!'.format(
            self.ix.doc_count()))
    else:
        print('No valid cache found. Building indexes...')
        now = time.time()
        self.__build_whoosh_index(schema_dir)
        print('Finished in {:.2f} seconds'.format(time.time() - now))
    self.parser = MultifieldParser(self.ix.schema.names(), schema=self.ix.schema)
    self.parser.add_plugin(FuzzyTermPlugin())
def test_fuzzy_prefix():
    from whoosh import scoring

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT(spelling=True))
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        # Match -> first
        w.add_document(title=u("First"),
                       content=u("This is the first document we've added!"))
        # No match
        w.add_document(title=u("Second"),
                       content=u("The second one is even more interesting! filst"))
        # Match -> first
        w.add_document(title=u("Third"),
                       content=u("The world first line we've added!"))
        # Match -> zeroth
        w.add_document(title=u("Fourth"),
                       content=u("The second one is alaways comes after zeroth!"))
        # Match -> fire is within 2 edits (transpose + delete) of first
        w.add_document(title=u("Fifth"),
                       content=u("The fire is beautiful"))

    from whoosh.qparser import QueryParser, FuzzyTermPlugin

    parser = QueryParser("content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())
    q = parser.parse("first~2/3 OR zeroth", debug=False)
    assert isinstance(q, query.Or)
    ft = q[0]
    assert isinstance(ft, query.FuzzyTerm)
    assert ft.maxdist == 2
    assert ft.prefixlength == 3

    with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        results = searcher.search(q)
        assert len(results) == 4
        assert (" ".join(sorted(hit["title"] for hit in results))
                == "Fifth First Fourth Third")
def __init__(self, location):
    """Create a new whoosh store; the location given will be used to
    generate keys, and these keys will be combined to get/set instance
    config.

    Args:
        location (Location)
    """
    super().__init__(location, Serializer())
    config = self.config_env.get_store_config("whoosh")
    self.base_index_path = config["path"]
    self.schema = self.get_schema()
    self.index = self.get_index(self.schema)
    self.default_plugins = [FuzzyTermPlugin(), GtLtPlugin(), PhrasePlugin()]
    self.default_pagenum = 1
    self.default_pagelen = 20
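A sketch of how the stored default_plugins list might be attached to a query parser elsewhere in the class; the helper name and fieldnames argument are assumptions:

from whoosh.qparser import MultifieldParser

def _build_parser(self, fieldnames):
    # Hypothetical helper: attach every default plugin to a fresh parser.
    parser = MultifieldParser(fieldnames, schema=self.schema)
    for plugin in self.default_plugins:
        parser.add_plugin(plugin)
    return parser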
def search_in_index(search_kw, index):
    '''
    search_kw: what the user typed into the search bar
    index: the opened index (the ix object from the preceding code)

    Returns a dictionary with the keys:
    - results: a list of dictionaries, one per search result, best result
      first. Each dictionary has two keys: 'title' with the document title,
      and 'path' with the path (to the text document, for now).
    - suggestions: a dictionary of suggestions proposing a possible
      correction for each word the user entered. Still to be decided: how to
      merge the suggestions for the individual words into complete suggested
      queries.
    '''
    # Use a MultifieldParser to search both the title and the content.
    parser = MultifieldParser(["content", "title"], index.schema)
    # Add a fuzzy-matching plugin so the search goes beyond exact words.
    parser.add_plugin(FuzzyTermPlugin())
    searcher = index.searcher()
    # Rewrite the user query into the syntax understood by the fuzzy plugin.
    to_parse = ' '.join([i + '~1' for i in search_kw.split(' ')])
    myquery = parser.parse(to_parse)
    # Collect the results so the searcher can be closed afterwards.
    r = searcher.search(myquery)
    results = []
    for res in r:
        results.append({'title': res['title'], 'path': res['path']})
    # Set up the corrector and store its proposals for each typed word.
    corrector = searcher.corrector("content")
    suggestions = {}
    for kw in search_kw.split(' '):
        suggestions[kw] = corrector.suggest(kw)
    # Close the searcher.
    searcher.close()
    return {'results': results, 'suggestions': suggestions}
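The docstring above leaves open how to merge per-word suggestions into complete corrected queries; one possible approach, sketched here as an assumption rather than part of the original code, is a cartesian product of the top suggestions for each word:

import itertools

def combine_suggestions(search_kw, suggestions, per_word=2):
    # For each typed word keep at most `per_word` corrections, falling back
    # to the word itself when the corrector proposes nothing.
    options = [suggestions.get(kw, [])[:per_word] or [kw]
               for kw in search_kw.split(' ')]
    # Every combination of one correction per word becomes a candidate query.
    return [' '.join(combo) for combo in itertools.product(*options)]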
id = Column(Integer, primary_key=True)
title = Column(Text)
body = Column(UnicodeText)
# Pass the callable itself so the timestamp is evaluated per row,
# not once at class-definition time.
created = Column(DateTime, default=datetime.datetime.utcnow)

def __repr__(self):
    return '{0}(title={1})'.format(self.__class__.__name__, self.title)


Base.metadata.create_all(engine)

index_subscriber = IndexSubscriber(session=session, whoosh_base_path='index')
index_subscriber.subscribe(Post)

p1 = Post(title='love barcelona',
          body='it is the best city in the world even before madrid!')
p2 = Post(title='love madrid',
          body='it is the second best city in the world after barcelona!')
session.add_all([p1, p2])
session.commit()

# normal search, this does not keep whoosh score
Post.whoosh.search('barcelona').all()
Post.whoosh.search('madrid').all()

# ordered result based on whoosh score
results = Post.whoosh.search_all_ordered('madrid')
results = Post.whoosh.search_all_ordered('barcelona')

from whoosh.qparser import FuzzyTermPlugin
Post.whoosh.search_all_ordered('baarcelonaa~2', plugin=FuzzyTermPlugin())
def main():
    """ The main loop for the program """
    g = Gui()
    ix = index.open_dir("indexdir")
    while True:
        event, values = g.window.read()
        g.window["_output_"]('')
        # close windows
        if event is None:
            break
        if event == '_SEARCH_' and values['TERM'] is not None:
            # The 'fieldboosts' parameter controls how much weight matches
            # in each field carry.
            qp = MultifieldParser(
                ["procTitle", "topics", "categories", "procContent"],
                termclass=Variations,
                schema=ix.schema,
                fieldboosts={
                    "procTitle": 1.5,
                    "categories": 1.3
                })
            qp.add_plugin(FuzzyTermPlugin())
            terms = str(values['TERM'])
            terms = terms.replace("title", "procTitle").replace("topic", "topics") \
                .replace("category", "categories").replace("content", "procContent")

            # If the synonym option is enabled, rewrite the query to add
            # synonyms, taking care to pass boolean tokens through unchanged
            # and to translate field prefixes correctly when the user
            # restricts a term to a field.
            if values['syn_search']:
                with open("utils/wn_s.pl", "r") as f:
                    thesaurus = Thesaurus.from_file(f)
                termsWithSynonyms = []
                for term in terms.split(" "):
                    field = None
                    if ":" in term:
                        field = term.split(":")[0]
                        term = term.split(":")[1]
                    if term not in booleanTokens:
                        termSynonyms = thesaurus.synonyms(term)
                        if field is not None:
                            termSynonyms = [f"{field}:{t}" for t in termSynonyms]
                            termSynonyms.append(f"{field}:{term}")
                        else:
                            termSynonyms.append(term)
                        termsWithSynonyms.append(" OR ".join(termSynonyms))
                    else:
                        termsWithSynonyms.append(term)
                terms = ' '.join(termsWithSynonyms)

            print("- Searching for >>> " + str(terms))

            # Stem the query terms, appending a tilde for fuzzy search to the
            # ones actually changed by stemming.
            words = terms.split(' ')
            stemmedWords = list()
            for word in words:
                stemmed = stem(word)
                if word != stemmed:
                    stemmedWords.append(stemmed + '~')
                else:
                    stemmedWords.append(stemmed)
            q = qp.parse(' '.join(stemmedWords))

            with ix.searcher() as searcher:
                if not values['syn_search']:
                    correction = searcher.correct_query(q=q, qstring=terms, maxdist=2)
                    if terms != correction.string:
                        print("- Did you mean >>> " + correction.string)
                results = searcher.search(q, terms=True)
                if not values['syn_search'] and results.is_empty():
                    print("- No relevant result has been found for query, "
                          "trying corrected query")
                    results = searcher.search(qp.parse(correction.string))
                numb = 1
                if not results.is_empty():
                    for elem in results:
                        # print(elem)
                        print(f"Result n.{numb} >>> Title: {str(elem['docTitle'])}\n"
                              f"\tScore: {str(elem.score)}\n"
                              f"\tLink to the page: {str(elem['pageUrl'])}\n")
                        numb += 1
                else:
                    print("- No relevant result has been found")
def _criar_parser():
    parser = QueryParser('nome', schema)
    parser.add_plugin(FuzzyTermPlugin())
    return parser
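A hypothetical call site; the query string is an example of a fuzzy search against the 'nome' field allowing one edit:

parser = _criar_parser()
consulta = parser.parse(u'joao~1')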
def test_plugins_search(self):
    results = self.Post.whoosh.search_all_ordered('baarcelonaa~2',
                                                  plugin=FuzzyTermPlugin())
    self.assertEqual(len(results), 2)
    self.assertEqual(results[0].id, 1)
    self.assertEqual(results[1].id, 2)
# coding=utf-8
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser import FuzzyTermPlugin

idx_dir = 'lagou_idx'
ix = open_dir(idx_dir)
searcher = ix.searcher()

# Multifield parser searching both "name" and "desc".
parser = MultifieldParser(["name", "desc"], schema=ix.schema)
parser.add_plugin(FuzzyTermPlugin())

k = u'搜索 OR Pythn~2 city:上海'
q = parser.parse(k)

results = searcher.search_page(q, 1, pagelen=5)
print(u'{0} results found for keyword {1}, {2} returned: '.format(
    len(results), k, results.scored_length()))
for hit in results[:50]:
    print(hit['id'])
    print(hit['name'])
    # print(hit['city'])
    print(hit['com_name'])
    print('************')