def parse_query(self, query, *fields, group="AND"):
    """Parse *query* against *fields* of this object's index.

    :param query: raw query string entered by the user
    :param fields: field names to search across
    :param group: "OR" for best-match OR-grouping, anything else for the
        parser's default AND grouping
    :return: a parsed whoosh query object
    """
    if group == "OR":
        field_parser = qparser.MultifieldParser(
            fields, schema=self.index.schema, group=qparser.OrGroup)
    else:
        field_parser = qparser.MultifieldParser(
            fields, schema=self.index.schema)
    return field_parser.parse(query)
def render_results(s, qs, template):
    """Search with searcher *s* for query string *qs* and render *template*.

    Searches the "tgrams" and "title" fields, sorted by title, and passes the
    results plus a highlighting callback to the template.
    NOTE(review): relies on module globals SOURCEDIR, DateParserPlugin,
    render_template and request — confirm they are imported at file level.
    """
    #qp = qparser.QueryParser("content", s.schema)
    qp = qparser.MultifieldParser(["tgrams", "title"], s.schema)
    # Add the DateParserPlugin to the parser
    qp.add_plugin(DateParserPlugin())
    q = qp.parse(qs)
    results = s.search(q, limit=100, sortedby="title")
    #results = s.search(q, limit=100, sortedby="title", reverse=True)
    #results = s.search(q, limit=100, groupedby="chapter")
    # Keep the (possibly rewritten) query object actually used by the search.
    q = results.q
    #hf = highlight.HtmlFormatter()
    #results.highlighter = highlight.Highlighter(formatter=hf)
    # qc: spelling-corrected query — not computed here, template expects it.
    qc = None

    def hilite(hit):
        # Re-read the source document from disk to highlight matched terms,
        # since the "content" field is apparently not stored in the index.
        with open(SOURCEDIR + hit["path"], "rb") as hitfile:
            text = hitfile.read().decode("utf-8")
        return hit.highlights("content", text)

    return render_template(template, qs=qs, q=q, results=results,
                           hilite=hilite, corrected=qc, args=request.args)
def search_content(self, search_statement: str):
    """Full-text search over indexed messages.

    Detects the language of *search_statement* (Turkish vs. English) and
    searches the matching pair of fields with fuzzy term matching.

    :param search_statement: raw user query
    :return: list of dicts with docnum, message_id, chat_id, message and
        meta_content for every hit
    """
    cur_lang = "en"
    try:
        # langdetect raises on empty/ambiguous input; best-effort fallback
        # to English in that case.  (Unused `as e` binding removed.)
        if langdetect.detect(search_statement) == "tr":
            cur_lang = "tr"
    except Exception:
        pass
    # Turkish content is indexed in parallel *_tr fields.
    search_fields = (["message", "meta_content"] if cur_lang == "en"
                     else ["message_tr", "meta_content_tr"])
    with self.__index.searcher() as searcher:
        query = qparser.MultifieldParser(
            search_fields, self.__schema,
            termclass=CustomFuzzyTerm).parse(search_statement)
        results = searcher.search(query)
        return [{
            "_docnum": r.docnum,
            "message_id": r["message_id"],
            "chat_id": r["chat_id"],
            "message": r["message"],
            "meta_content": r["meta_content"]
        } for r in results]
def post_search(pn=1, size=10):
    """Paged keyword search over the 'posts' index, rendered as HTML."""
    kw = request.values.get('kw')
    if kw is None:
        # No keyword parameter supplied — show the error page.
        return render_template('search/list.html', title='搜索',
                               message='搜索关键字不能为空!')
    schema = whoosh_searcher.get_index('posts').schema
    # q = query.Or([query.Term('title', keyword), query.Term('content', keyword)])
    query_obj = qparser.MultifieldParser(['title', 'content'], schema).parse(kw)
    with whoosh_searcher.get_searcher('posts') as searcher:
        hits = searcher.search_page(query_obj, pagenum=pn, pagelen=size,
                                    sortedby=sorting.ScoreFacet())
        records = [hit.fields() for hit in hits.results]
        page = models.Page(pn, size, result=records,
                           has_more=hits.pagecount > pn,
                           total_page=hits.pagecount, total=hits.total)
        print(page.result)
        # return jsonify(page)
        return render_template('search/list.html', title=kw + '搜索结果',
                               page=page, kw=kw)
def finalresult():
    """Handle a QA search POST and render the top-5 question/answer matches."""
    if request.method == 'POST':
        # search query
        query = request.form['QA']
        print(query)
        ix = index.open_dir("qadata_Index")
        schema = ix.schema
        # Create query parser that looks through designated fields in index
        og = qparser.OrGroup.factory(0.9)
        mp = qparser.MultifieldParser(['question', 'answer'], schema, group=og)
        # This is the user query
        q = mp.parse(query)
        # Actual searcher, prints top hits
        with ix.searcher(weighting=scoring.BM25F()) as s:
            results = s.search(q, limit=5)
            # BUG FIX: the original did `for i in range(5)` and indexed
            # results[i] unconditionally, raising IndexError whenever fewer
            # than five documents matched. Iterate only over actual hits.
            for hit in results:
                print(hit['question'], str(hit.score), hit['answer'])
            # NOTE(review): the template call below still assumes exactly
            # five hits; queries with fewer matches need template changes
            # before this can be fully robust.
            return render_template(
                "result.html", searchquery=query,
                Q1=results[0]['question'], A1=results[0]['answer'],
                Q2=results[1]['question'], A2=results[1]['answer'],
                Q3=results[2]['question'], A3=results[2]['answer'],
                Q4=results[3]['question'], A4=results[3]['answer'],
                Q5=results[4]['question'], A5=results[4]['answer'])
def Rprecision(queries, ground_truth, top_5_table):
    """Compute R-precision per query for each (Analyzer, Scoring) pair.

    For every analyzer/scoring combination in *top_5_table*, each query in
    *queries* is run with a result limit equal to the size of its ground-truth
    set; R-precision is the fraction of retrieved ids that are relevant.

    :param queries: mapping query_id -> query string
    :param ground_truth: mapping query_id -> collection of relevant doc ids
    :param top_5_table: pair of parallel sequences (analyzer names, scoring
        function names)
    :return: dict "<Analyzer>_<Scoring>" -> {query_id: r-precision}
    """
    final_result_dict = {}
    for Analyzer, Scoring_Function in zip(top_5_table[0], top_5_table[1]):
        ix = index.open_dir(path + Analyzer + "Analyzer")
        scoring_function = getattr(scoring, Scoring_Function)()
        analyzer_result = {}
        for q_id in queries:
            query = queries[q_id]
            max_number_of_results = len(ground_truth[q_id])
            # Search every field except the document id itself.
            fields = ix.schema.names()
            fields.remove('id')
            qp = qparser.MultifieldParser(fields, ix.schema)
            parsed_query = qp.parse(query)
            # BUG FIX: a searcher was opened per query but only the last one
            # was closed (searcher.close() sat outside the loop) — use a
            # context manager so each searcher is released.
            with ix.searcher(weighting=scoring_function) as searcher:
                results = searcher.search(parsed_query,
                                          limit=max_number_of_results)
                # BUG FIX: the original tested membership in the undefined
                # name `gt` instead of the `ground_truth` parameter.
                relevant = sum(1 for result in results
                               if int(result["id"]) in ground_truth[q_id])
            analyzer_result[q_id] = relevant / len(ground_truth[q_id])
        final_result_dict["_".join([Analyzer, Scoring_Function])] = analyzer_result
    return final_result_dict
def CreateQueryParser():
    """Initialise the module-level parser `qp` over Title/ParaText.

    Uses best-match OR grouping (factory 0.9) and enables fuzzy term syntax.
    """
    global qp
    best_match_or = qparser.OrGroup.factory(0.9)
    qp = qparser.MultifieldParser(["Title", "ParaText"],
                                  schema=ix.schema, group=best_match_or)
    qp.add_plugin(FuzzyTermPlugin())
def search(self, phrase, content=False, username=None, ct=10, page=0):
    """Implement the search, returning a list of bookmarks.

    :param phrase: query string
    :param content: also search the readable page content field
    :param username: restrict results to this user's bookmarks
    :param ct: page length
    :param page: 0-based page number (whoosh pages are 1-based internally)
    """
    page = int(page) + 1
    with WIX.searcher() as search:
        fields = ['description', 'extended', 'tags']
        if content:
            fields.append('readable')
        parser = qparser.MultifieldParser(fields,
                                          schema=WIX.schema,
                                          group=qparser.OrGroup)
        qry = parser.parse(phrase)
        # BUG FIX: the original wrapped this call in
        # `except ValueError, exc: raise(exc)` — Python-2-only syntax that
        # caught the exception only to re-raise it unchanged. The dead
        # handler is removed; a ValueError still propagates to the caller.
        res = search.search_page(qry, page, pagelen=int(ct))
        if res:
            # Map hit ids back to Bmark rows, eager-loading the hash.
            qry = Bmark.query.filter(
                Bmark.bid.in_([r['bid'] for r in res])
            )
            if username:
                qry = qry.filter(Bmark.username == username)
            qry = qry.options(joinedload('hashed'))
            return qry.all()
        else:
            return []
def search_index(search_model, query, fields=None, limit=None):
    """Search *search_model*'s whoosh index for *query*.

    :param search_model: object exposing get_path() and a `fields` list
    :param query: raw query string; '+' and '|' are mapped to AND/OR
    :param fields: field names to search (defaults to search_model.fields)
    :param limit: max hits (defaults to settings.DJOOSH_SEARCH_LIMIT or 100)
    :return: whoosh results, or [] on any parse/search failure
    """
    ix = index.open_dir(search_model.get_path())
    # BUG FIX: mutable default argument `fields=[]` replaced with None
    # (behaviour unchanged: falsy values fall back to search_model.fields).
    fields = fields or search_model.fields
    hits = []
    query = smart_unicode(query)
    limit = limit or getattr(settings, 'DJOOSH_SEARCH_LIMIT', 100)
    if query and fields:
        query = query.replace('+', ' AND ').replace('|', ' OR ')
        parser = qparser.MultifieldParser(fields, schema=ix.schema)
        # BUG FIX: the original parsed the query once *before* the
        # try/except, so a malformed query raised before the guard could
        # catch it; the redundant unguarded parse is removed.
        try:
            qry = parser.parse(query)
        except Exception:
            qry = None
        if qry:
            searcher = ix.searcher()
            try:
                hits = searcher.search(qry, limit=limit)
            except Exception:
                hits = []
    ix.close()
    return hits
def post_search(pn=1, size=10):
    """Paged keyword search over the 'posts' index.

    :param pn: 1-based page number
    :param size: page length
    :return: rendered search result page (or an error page when no keyword)
    """
    keyword = request.values.get('kw')
    if keyword is None:
        return render_template('search/list.html', title='搜索',
                               message='搜索关键字不能为空')
    with whoosh_searcher.get_searcher('posts') as searcher:
        parser = qparser.MultifieldParser(
            ['title', 'content'], whoosh_searcher.get_index('posts').schema)
        # BUG FIX: original called parser.parser(keyword) — QueryParser has
        # no `parser` method (AttributeError); the correct call is parse().
        q = parser.parse(keyword)
        result = searcher.search_page(q, pagenum=pn, pagelen=size,
                                      sortedby=sorting.ScoreFacet())
        result_list = [x.fields() for x in result.results]
        page = models.Page(page_num=pn, per_page=size,
                           result_list=result_list,
                           has_more=result.pagecount > pn,
                           total_page=result.pagecount, total=result.total)
        return render_template('search/list.html', title=keyword + '搜索结果',
                               page=page, kw=keyword)
def test_missing_field_scoring():
    """Field lengths must stay correct when a later document omits a field."""
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           hobbies=fields.TEXT(stored=True))
    storage = RamStorage()
    ix = storage.create_index(schema)
    writer = ix.writer()
    writer.add_document(name=u('Frank'), hobbies=u('baseball, basketball'))
    writer.commit()
    r = ix.reader()
    # One doc: two hobby terms, one name term.
    assert_equal(r.field_length("hobbies"), 2)
    assert_equal(r.field_length("name"), 1)
    r.close()
    # Second document has no "hobbies" value at all.
    writer = ix.writer()
    writer.add_document(name=u('Jonny'))
    writer.commit()
    with ix.searcher() as s:
        r = s.reader()
        # Both commits should have merged into a single segment.
        assert_equal(len(ix._segments()), 1)
        assert_equal(r.field_length("hobbies"), 2)
        assert_equal(r.field_length("name"), 2)
        # Searching across both fields must still score/retrieve correctly.
        parser = qparser.MultifieldParser(['name', 'hobbies'], schema)
        q = parser.parse(u("baseball"))
        result = s.search(q)
        assert_equal(len(result), 1)
def __init__(self, index=None, search_fields=["title", "content"],
             html_formatter=None, parser=None, termclass=Term):
    """Searcher over several fields of a whoosh index.

    (Docstring translated from Spanish.)

    :param index: whoosh.index.Index instance; opened from INDEX_DIR when
        omitted
    :param search_fields: list of field names to search
    :param html_formatter: whoosh.highlight.HtmlFormatter instance used to
        format hits
    :param parser: pre-built query parser; a MultifieldParser over
        search_fields is created when omitted
    :param termclass: term class passed to the parser (e.g. fuzzy terms)

    NOTE(review): `search_fields` uses a mutable default list; it is only
    read here, never mutated, but callers should not rely on its identity.
    """
    self.index = index or open_dir(INDEX_DIR)
    self.html_formatter = html_formatter or HtmlFormatter(
        between="...", tagname="strong", classname="search-match",
        termclass="search-term")
    self.search_fields = search_fields
    self.termclass = termclass
    self.parser = parser or qparser.MultifieldParser(
        self.search_fields, self.index.schema, termclass=termclass)
def search(self, query_str, limit=20):
    """Search the configured columns for *query_str*.

    :param query_str: raw query string
    :param limit: maximum number of hits
    :return: list of stored-field dicts, one per hit
    """
    parsed = qparser.MultifieldParser(
        self.setting.searchable_columns,
        schema=self.get_schema(),
    ).parse(query_str)
    with self.get_index().searcher() as searcher:
        hits = searcher.search(parsed, limit=limit)
        return [hit.fields() for hit in hits]
def parse(filt):
    # Build a whoosh query object from a Filter (Python 2 code: uses
    # `unicode`). NOTE(review): `schema` and `cls` come from an enclosing
    # scope not visible here — confirm against the surrounding class.
    if filt.query_type == Filter.Q_APPROX:
        # Approximate match: let the multifield parser interpret the string.
        mp = qparser.MultifieldParser(filt.get_fields(), schema=schema)
        return mp.parse(unicode(filt.query_string))
    elif filt.query_type == Filter.Q_EXACT:
        # Exact match: scan all stored documents for a field whose stored
        # value equals the query string, then OR their ids together.
        # NOTE(review): the searcher `s` is never closed — potential leak.
        s = cls.get_index().searcher()
        qs = filt.query_string
        f = lambda d: qs in [ d.get(field) for field in filt.get_fields() ]
        ids = [unicode(d['id']) for d in filter(f, s.documents())]
        return query.Or([query.Term('id', iden) for iden in ids])
def search(self, string=None, fields=["title", "content"]):
    """Fuzzy best-match search for *string* over *fields*; yields raw hits.

    The phrase plugin is swapped for sequence+fuzzy plugins so quoted input
    is treated as an ordered sequence of (possibly fuzzy) terms.
    """
    qp = qparser.MultifieldParser(fields, self.ix.schema,
                                  group=qparser.OrGroup)
    qp.remove_plugin_class(qparser.PhrasePlugin)
    qp.add_plugin(qparser.FuzzyTermPlugin())
    qp.add_plugin(qparser.SequencePlugin())
    # Wrap the input in quotes so the SequencePlugin handles it.
    parsed = qp.parse(u'"{}"'.format(string))
    with self.ix.searcher(weighting=scoring.BM25F) as searcher:
        for hit in searcher.search(parsed, limit=None):
            yield hit
def search_query(request, response_format='html'):
    """Account view: run a search and render the matching Objects.

    Supports a "tags:" prefix for tag lookups; otherwise dispatches to the
    configured search engine ('whoosh' or 'db').
    """
    objects = []
    query = request.GET.get('q', '')
    if query:
        if query[:5] == 'tags:':
            tag_names = query[5:].strip().split(',')
            tags = Tag.objects.filter(name__in=tag_names)
            objects = Object.objects.filter(tags__in=tags)
        else:
            search_engine = getattr(settings, 'SEARCH_ENGINE', 'whoosh')
            if search_engine == 'whoosh':
                ix = index.open_dir(settings.WHOOSH_INDEX)
                # Whoosh doesn't understand '+' or '-' but we can replace
                # them with 'AND' and 'NOT'.
                # BUG FIX: spaces must be rewritten *first* — the original
                # ran .replace(' ', ' OR ') last, which mangled the
                # ' AND '/' OR ' operators it had just inserted.
                squery = query.replace(' ', ' OR ').replace(
                    '+', ' AND ').replace('|', ' OR ')
                parser = qparser.MultifieldParser(
                    ["name", "url", "type", "content"], schema=ix.schema)
                # BUG FIX: the original parsed once outside the try block
                # (defeating the guard) and again inside; parse only inside.
                try:
                    qry = parser.parse(squery)
                except Exception:
                    # don't show the user weird errors only because we don't
                    # understand the query.
                    # parser.parse("") would return None
                    qry = None
                if qry:
                    searcher = ix.searcher()
                    try:
                        hits = searcher.search(qry, limit=100)
                    except Exception:
                        hits = []
                    hit_ids = [hit['id'] for hit in hits]
                    objects = Object.objects.filter(pk__in=hit_ids)
            elif search_engine == 'db':
                objects = dbsearch.search(query)
            else:
                raise RuntimeError(
                    'Unknown Search engine: {0!s}'.format(search_engine))
    return render_to_response('core/search/query_view', {
        'query': query,
        'objects': objects
    }, context_instance=RequestContext(request),
        response_format=response_format)
def _mk_parser(self):
    # Build and cache (on self.parser) a query parser covering every field
    # in the index schema, with fuzzy/range/quote syntax enabled.
    from whoosh import qparser as qparse
    parser = qparse.MultifieldParser(self.idx_obj.schema.names(),
                                     self.idx_obj.schema)
    # XXX: plugin is broken in Debian's whoosh 2.7.0-2, but already fixed
    # upstream
    parser.add_plugin(qparse.FuzzyTermPlugin())
    parser.add_plugin(qparse.GtLtPlugin())
    parser.add_plugin(qparse.SingleQuotePlugin())
    # replace field definition to allow for colons to be part of a field's name:
    parser.replace_plugin(
        qparse.FieldsPlugin(expr=r"(?P<text>[()<>.\w]+|[*]):"))
    self.parser = parser
def index_search(dirname, search_fields, search_query):
    """Open the index at *dirname* and print the top-10 hits for the query.

    Uses best-match OR grouping (factory 0.9) over *search_fields*.
    """
    idx = index.open_dir(dirname)
    # Query parser spanning the requested fields with scored OR grouping.
    parser = qparser.MultifieldParser(search_fields, idx.schema,
                                      group=qparser.OrGroup.factory(0.9))
    parsed = parser.parse(search_query)
    with idx.searcher() as searcher:
        hits = searcher.search(parsed, limit=10)
        print("Search Results: ")
        pprint(hits[0:10])
def index_search(dirname, search_fields, search_query):
    """Search the index at *dirname* and return hit paths as a DataFrame.

    :param dirname: directory containing the whoosh index
    :param search_fields: field names to search (OR-grouped, factory 0.9)
    :param search_query: raw query string
    :return: pandas DataFrame with a single 'path' column (empty when the
        query matches nothing)
    """
    ix = index.open_dir(dirname)
    og = qparser.OrGroup.factory(0.9)
    mp = qparser.MultifieldParser(search_fields, ix.schema, group=og)
    q = mp.parse(search_query)
    with ix.searcher() as s:
        # NOTE(review): `return_results_count` is a module-level setting —
        # confirm it is defined at file scope.
        results = s.search(q, terms=True, limit=return_results_count)
        print("Completing Whoosh Search")
        # BUG FIX: the original built one single-row DataFrame per hit and
        # pd.concat'ed them, which raises ValueError on zero hits; building
        # the frame from a plain list handles the empty case and is O(n).
        paths = [hit['path'] for hit in results]
    return pd.DataFrame(paths, columns=['path'])
def searchDataByNameAndType(name, type):
    """Print files whose title matches *name** and whose extension matches.

    Builds the query "<name>* <type>" over the title/extension fields of the
    index in "indexdir", scored by raw term frequency.
    """
    ix = open_dir("indexdir")
    query_str = name + '* ' + type
    with ix.searcher(weighting=scoring.Frequency) as searcher:
        parsed = qparser.MultifieldParser(["title", "extension"],
                                          ix.schema).parse(query_str)
        hits = searcher.search(parsed, limit=None)
        if len(hits) > 0:
            for hit in hits:
                print("File Name: " + hit['title'],
                      "Path: " + hit['path'],
                      "Extension: " + hit['extension'])
        else:
            print("Aucun resultat trouvé !")
def search_clips(query, page):
    '''search_clips returns the clips found by the given query

    Clips are stored in a named tuple called ClipSearchResults.
    ClipSearchResults has two fields:
    - clips: a list of Clip objects on the page
    - length: the total number of clips that match the query
    '''
    search_fields = ['title', 'description', 'tags', 'user']
    parser = qparser.MultifieldParser(search_fields, clip_index.schema)
    parsed = parser.parse(query)
    with clip_index.searcher() as searcher:
        hits = searcher.search_page(parsed, page,
                                    pagelen=cn.SEARCH_CLIPS_PER_PAGE)
        # Resolve hit ids to Clip model rows while the searcher is open.
        matched = [Clip.query.get(hit['id']) for hit in hits]
        total = len(hits)
    ClipSearchResults = namedtuple('ClipSearchResults', ['clips', 'length'])
    return ClipSearchResults(matched, total)
def search_index(self, search_term):
    """Yield whoosh hits for *search_term* over this model's non-key columns.

    Generator: results are only valid while iteration is in progress (the
    searcher is closed when the generator is exhausted).
    """
    model_index = self._get_index()
    schema = self._get_schema()
    fields = list()
    for field in self._get_indexable_columns():
        # The "id" column is indexed under the name "model_id".
        if field == "id":
            field = "model_id"
        value = getattr(self, field)
        # Do not search the primary key
        # NOTE(review): `value.primary_key` implies getattr returns a column
        # descriptor rather than a plain value — confirm what
        # _get_indexable_columns yields; a plain value would raise here.
        if not value.primary_key:
            fields.append(field)
    parser = qparser.MultifieldParser(fields, schema)
    query = parser.parse(search_term)
    with model_index.searcher() as searcher:
        results = searcher.search(query)
        for result in results:
            yield result
def validate_question(text):
    """Check validity of the question: 1. filter by length 2. auto-correction 3. OOV

    Returns (ok, message): ok is False with an error message when the
    question is rejected; ok is True with a warning (or "") otherwise.
    """
    global valid, ix
    # empty or too long questions are not allowed
    if len(text) == 0:
        error = "<div class=\"alert alert-warning\"> Sorry, the question appears to be empty. Try again? </div>"
        return False, error
    elif len(text) > 150:
        error = "<div class=\"alert alert-warning\"> Sorry, the question is too long. Try to use only 150 characters." \
                " </div>"
        return False, error
    mparser = qparser.MultifieldParser(["answer"], schema=ix.schema)
    # auto-correction built in Whoosh
    with ix.searcher() as s:
        # Strip question marks before parsing so they are not treated as
        # query syntax.
        q = mparser.parse(text.replace("?", ""))
        corrected = s.correct_query(q, text)
        if corrected.query != q:
            # Offer the corrected spelling as a "did you mean" link.
            error = "<div class=\"alert alert-warning\"> Did you mean: <a href=\"" + url_for(
                'passage.process_question', received_question=corrected.string
            ) + "\">" + corrected.string + "</a>?</div>"
            return False, error
    # the question is valid, but contains lemmata which are not in vocabulary, so a warning is displayed
    oov_num = 0
    for word in text.replace("?", "").split():
        # Try the word as-is, then lowercased, before counting it as OOV.
        if not (lemmatizer.lemmatize(word, pos='v') in vocabulary_encoded.keys()):
            if not (lemmatizer.lemmatize(
                    word.lower(), pos='v')) in vocabulary_encoded.keys():
                oov_num += 1
    if oov_num != 0:
        if oov_num < len(text.split()):
            # Some words are known: accept with a warning.
            warning = "<div class=\"alert alert-warning\"> The question has words that are not in vocabulary." \
                      " if you rephrase it, you might get better results. </div>"
            return True, warning
        else:
            # Every word is unknown: reject.
            error = "<div class=\"alert alert-warning\"> Sorry, could not understand your input." \
                    " </div>"
            return False, error
    return True, ""
def test_boolean_multifield():
    """A non-matching text term across a TEXT+BOOLEAN multifield parse must
    still select the documents whose boolean field matches."""
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           bit=fields.BOOLEAN(stored=True))
    ix = RamStorage().create_index(schema)
    cars = [(u('audi'), True), (u('vw'), False), (u('porsche'), False),
            (u('ferrari'), True), (u('citroen'), False)]
    with ix.writer() as w:
        for car_name, flag in cars:
            w.add_document(name=car_name, bit=flag)
    with ix.searcher() as s:
        qp = qparser.MultifieldParser(["name", "bit"], schema)
        r = s.search(qp.parse(u("boop")))
        assert sorted(hit["name"] for hit in r) == ["audi", "ferrari"]
        assert len(r) == 2
def index_search(dirname, search_fields, search_query):
    """Search the index at *dirname* and print stats plus the top-5 Q/A hits."""
    ix = index.open_dir(dirname)
    schema = ix.schema
    # Create query parser that looks through designated fields in index
    og = qparser.OrGroup.factory(0.9)
    mp = qparser.MultifieldParser(search_fields, schema, group = og)
    # This is the user query
    q = mp.parse(search_query)
    # Actual searcher, prints top 10 hits
    with ix.searcher() as s:
        results = s.search(q, limit = None)
        print("Total Documents: ",ix.doc_count_all())
        print("Retrieved Documents: ",results.estimated_length())
        # NOTE(review): _get_scorer() is a private whoosh API and may break
        # across versions — printed here for debugging only.
        print(results._get_scorer())
        # `i` is unused; enumerate kept to preserve the original behaviour.
        for i,result in enumerate(results[0:5]):
            print("Search Results: ",result.rank,"Score: ",result.score)
            print("Question: ",result['question'])
            print("Answer: ",result['answer'])
            print("------------------------")
def search(query, page=1, per_page=20):
    """Paged title/content search with HTML highlighting.

    (Python 2 code: uses `unicode`.)  Returns a SearchResultPage; an
    out-of-range page yields an empty page-1 result or None otherwise.
    """
    with index.searcher() as s:
        qp = qparser.MultifieldParser(['title', 'content'], index.schema)
        q = qp.parse(unicode(query))
        try:
            result_page = s.search_page(q, page, pagelen=per_page)
        except ValueError:
            # NOTE(review): presumably raised for an invalid page number —
            # page 1 is still rendered as an empty result set.
            if page == 1:
                return SearchResultPage(None, page)
            return None
        results = result_page.results
        # Tune snippet size and decorate matches for HTML display.
        results.highlighter.fragmenter.maxchars = 512
        results.highlighter.fragmenter.surround = 40
        results.highlighter.formatter = highlight.HtmlFormatter(
            'em', classname='search-match', termclass='search-term',
            between=u'<span class=ellipsis> … </span>')
        return SearchResultPage(result_page, page)
def post_search(pn=1, size=10):
    """Rebuild the 'posts' index from MongoDB, then run a paged search."""
    kw = request.values.get('kw')
    if kw is None:
        return render_template('search/list.html', title='搜索',
                               message='搜索关键字不能为空!')
    # Re-index every post document on each request (kept from the original
    # behaviour; expensive but ensures the index is fresh).
    whoosh_searcher.clear('posts')
    writer = whoosh_searcher.get_writer('posts')
    projection = ['_id', 'title', 'content', 'create_at', 'user_id',
                  'catalog_id']
    for doc in mongo.db['posts'].find({}, projection):
        doc['obj_id'] = str(doc['_id'])
        doc['user_id'] = str(doc['user_id'])
        doc['catalog_id'] = str(doc['catalog_id'])
        doc.pop('_id')
        writer.add_document(**doc)
    # Persist the rebuilt index.
    writer.commit()
    with whoosh_searcher.get_searcher('posts') as searcher:
        # Parse the query string against title/content.
        schema = whoosh_searcher.get_index('posts').schema
        q = qparser.MultifieldParser(['title', 'content'], schema).parse(kw)
        print('q:', q)
        # Run the paged search.
        found = searcher.search_page(q, pagenum=pn, pagelen=size,
                                     sortedby=sorting.ScoreFacet())
        rows = [hit.fields() for hit in found.results]
        # Build the page object for the template.
        page = Page(pn, size, result=rows, has_more=found.pagecount > pn,
                    page_count=found.pagecount, total=found.total)
        return render_template('search/list.html', title=kw + '搜索结果',
                               page=page, kw=kw)
def query_index(q, offset, limit):
    """Return (products, hit_count) for the 0-based page *offset*.

    Searches name/description in the index stored under 'index' and maps
    each hit to a product dict.
    """
    ix = index.open_dir('index')
    parsed = qparser.MultifieldParser(['name', 'description'],
                                      ix.schema).parse(q)
    products = []
    with ix.searcher() as searcher:
        page = searcher.search_page(parsed, pagenum=offset + 1,
                                    pagelen=limit)
        count = len(page)
        for hit in page:
            pprint(hit)
            products.append({
                'id': hit['ID'],
                'image': hit['image'],
                'name': hit['name'],
                'description': hit['description'],
                'price': hit['price']
            })
    return (products, count)
def test_missing_field_scoring(self):
    # Regression test (legacy whoosh API): field lengths must stay correct
    # when a later document omits one of the fields.
    schema = fields.Schema(name=fields.TEXT(stored=True),
                           hobbies=fields.TEXT(stored=True))
    storage = store.RamStorage()
    idx = index.Index(storage, schema, create=True)
    writer = idx.writer()
    writer.add_document(name=u'Frank', hobbies=u'baseball, basketball')
    writer.commit()
    self.assertEqual(idx.segments[0].field_length(0), 2) # hobbies
    self.assertEqual(idx.segments[0].field_length(1), 1) # name
    # Second document has no "hobbies" value at all.
    writer = idx.writer()
    writer.add_document(name=u'Jonny')
    writer.commit()
    # Both commits should have merged into one segment with unchanged
    # hobby length and incremented name length.
    self.assertEqual(len(idx.segments), 1)
    self.assertEqual(idx.segments[0].field_length(0), 2) # hobbies
    self.assertEqual(idx.segments[0].field_length(1), 2) # name
    # A multifield search must still find the single matching document.
    parser = qparser.MultifieldParser(['name', 'hobbies'], schema=schema)
    searcher = idx.searcher()
    result = searcher.search(parser.parse(u'baseball'))
    self.assertEqual(len(result), 1)
def question_tokens_to_query(keywords):
    """
    From a list of keywords and its synonyms, build a whoosh query.

    :param keywords: list of synonym groups; each group is a list of terms
        to be OR-ed together, groups are implicitly AND-ed by the parser
    :return: parsed whoosh query over the title/content fields
    """
    # Build query from keywords.
    # BUG FIX: the original appended " OR " after every term and sliced the
    # last 4 characters off, which produced a malformed ")" fragment for an
    # empty synonym group; " OR ".join() handles every group size correctly.
    query_str = "".join(
        "(" + " OR ".join(group) + ") " for group in keywords)
    # From query string build whoosh-defined query
    ix = index.open_dir(index_dir)
    parser = qparser.MultifieldParser(["title", "content"], ix.schema)
    parser.remove_plugin_class(qparser.PhrasePlugin)
    parser.add_plugin(qparser.SequencePlugin())  # For complex phrase query
    parser.add_plugin(qparser.FuzzyTermPlugin()
                      )  # Search for term that dont have to match exactly
    query = parser.parse(query_str)
    return query