def search(self):
    """Open a searcher on the letter index and build the query for ``self.word``.

    Side effects: sets ``self.searcher`` (a whoosh Searcher on ``ix_letter``)
    and ``self.query`` (the parsed query over the ``index_letter`` field).
    """
    self.searcher = ix_letter.searcher()
    # Single searchable field; the query string scopes the word to it explicitly.
    fields = ["index_letter"]
    qs = u'index_letter:({0})'.format(self.word)
    self.query = MultifieldParser(fields, ix_letter.schema).parse(qs)
# Python Application which provides live (search as you type) MSID results
import csv
import os, os.path
from whoosh import index
from whoosh.qparser import MultifieldParser
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, NGRAM, NGRAMWORDS
from whoosh.qparser import QueryParser

## Constants
NgramList = ('MSID', 'TECHNICAL_NAME', 'DESCRIPTION')  # fields to search

## Input - what am I searching for?
MyQuery = "PCA"
MSID_index_dir = 'MSID_idx_7'  # Relative to current path.

## Open Index (once — a redundant second open_dir call was removed)
ix = index.open_dir(MSID_index_dir)  # TBD: add cmdline flag to set/use a particular index
Searchable = ('MSID', 'TECHNICAL_NAME', 'DESCRIPTION')  ## List of fieldnames to search on, others are
qp = MultifieldParser(Searchable, schema=ix.schema)
q = qp.parse(MyQuery)

with ix.searcher() as s:
    results = s.search(q)
    print(len(results))
    # Bug fix: `results[0].keys` printed a bound-method repr; call it instead,
    # and only when there is at least one hit (avoids IndexError on no matches).
    if results:
        print(results[0].keys())
    for res in results:
        print(res['MSID'] + ' - ' + res['TECHNICAL_NAME'])
# Interactive search over a local Whoosh index, enriched with Wikipedia summaries.
from whoosh import *
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
from whoosh.fields import *
import wikipedia

index_path = r"C:\Users\Abhi\Downloads\Index"
ix = open_dir(index_path)
mparser = MultifieldParser(["title", "content"], schema=ix.schema)


def search():
    """Prompt for a query, search title/content, print each hit's content
    plus a two-sentence Wikipedia summary of its title."""
    query = input("Hi How can I help you")
    # input() already returns str — the redundant str() wrapper was removed.
    q = mparser.parse(query)
    with ix.searcher() as searcher:
        results = searcher.search(q, limit=20)
        for result in results:
            print(result['content'])
            # NOTE(review): wikipedia.summary can raise DisambiguationError /
            # PageError for ambiguous or missing titles — confirm whether a
            # try/except is wanted here.
            print(wikipedia.summary(result['title'], sentences=2))


if __name__ == '__main__':
    search()
# Dead commented-out Flask render_template code removed.
def parser(self, keyword, docType="all", sortType="None", fromDate="all time"):
    """Searches an Index for documents.

    Searches the Index for documents containing the selected keywords,
    with the selected parameters and the selected sorting method.

    Parameters
    ----------
    keyword : string
        Keywords to look for in the documents
    docType : string{'all','pdf','tex','py'}
        Type of document to look for.
    sortType: string{'None','By Number of ocurrences', 'By Date'}
        Sorting method to use.
    fromDate : string{'all time','this year', 'this month', 'this week'}
        Time interval to look for documents in.

    Returns
    -------
    tuple (bool, str or list)
        (True, [[path, tags], ...]) on success;
        (False, error message) on failure.
    """
    try:
        ix = open_dir(self.indexFolder)
    except EmptyIndexError:
        # Fix: previously returned a *list* here while every other path
        # returned a tuple; unified to a tuple (indexing callers unaffected).
        return (
            False,
            ("The index provided does not exist, make sure you add it before using it and do not " +
             "delete it manually"),
        )

    resultArray = []
    keyword = analyzeText(unidecode.unidecode(keyword))
    today = dt.now()
    date = ""

    if keyword == "":
        parseQuery = ""
    else:
        parseQuery = "content:" + keyword

    if fromDate != "all time":
        if fromDate == "this year":
            date = today - relativedelta(years=1)
        elif fromDate == "this month":
            date = today - relativedelta(months=1)
        elif fromDate == "this week":
            date = today - relativedelta(weeks=1)
        # Robustness fix: only add the date filter when fromDate matched a
        # known interval; previously an unrecognized value crashed on
        # "".strftime with AttributeError.
        if date != "":
            parseQuery = (parseQuery + " " + u"date:[" + date.strftime("%Y%m%d") +
                          " to " + today.strftime("%Y%m%d") + "]")

    with ix.searcher() as searcher:
        # Include the "tags" field only when filtering by document type.
        if docType == "all":
            query = MultifieldParser(["content", "date"],
                                     schema=ix.schema).parse(parseQuery)
        else:
            query = MultifieldParser(["content", "date", "tags"],
                                     schema=ix.schema).parse(parseQuery + " tags:" + docType)

        if sortType == "By Date":
            results = searcher.search(query, sortedby="date", reverse=True)
        elif sortType == "By Number of ocurrences":
            results = searcher.search(query, sortedby="nOccurrences", reverse=True)
        else:
            results = searcher.search(query)

        if results.is_empty():
            return (
                False,
                ("Não foram encontrados resultados com estes parâmetros de pesquisa"),
            )
        else:
            for result in results:
                path = result["path"]
                tag = result["tags"]
                resultArray.append([path, tag])
            return True, resultArray
results = [] # if __name__ == '__main__': # index_path = "./sindex" # else: # index_path = "./webspider/sindex" index_path = search_path try: ix = open_dir(index_path) # 搜索路径 except Exception, e: pass else: if not inputstring: pass else: publish_time = sorting.FieldFacet("publish_time", reverse=True) qp = MultifieldParser(["title", "body"], schema=ix.schema) with ix.searcher(weighting=scoring.TF_IDF()) as searcher: querystring = qp.parse(inputstring) results = searcher.search(querystring, terms=True, limit=None, sortedby=[publish_time]) # print(len(results)) # results = searcher.search_page(querystring, page) html_parser = HTMLParser.HTMLParser() if len(results) > 0: for i in xrange((page - 1) * size, page * size): if i in xrange(len(results)): tmpret = results[i].fields() hit_keywords = set() for key, val in results[i].matched_terms():
def _parser(fieldnames, schema, group, **kwargs):
    """Construct a MultifieldParser over *fieldnames* with the given grouping.

    Any extra keyword arguments are forwarded to the parser constructor.
    """
    kwargs["group"] = group
    return MultifieldParser(fieldnames, schema, **kwargs)
def main():
    """CLI entry point: read args, choose a scoring model, run each query
    against the index and print the ranked hits."""
    args = parse_args()
    number = args.number
    B, K1 = args.B, args.K1
    weight_B = args.weight_B

    # Queries either come from the command line (joined into one string)
    # or are read interactively/from file via read_query().
    if args.query is None:
        query_list = read_query()
    else:
        query_list = [' '.join(args.query)]

    index_loc = args.index_loc if args.index_loc is not None else 'index'

    # An explicit per-field weight vector implies the BM25F ranking.
    rank_func = 1 if weight_B is not None else args.rank_func

    if rank_func == 1:
        B1, B2, B3, B4, B5 = get_B(weight_B)
        weighting = scoring.BM25F(B=B, K1=K1, title_B=B1, body_B=B2,
                                  category_B=B3, date_B=B4, rating_B=B5)
        rank_name = 'bm25f'
    elif rank_func == 2:
        weighting, rank_name = scoring.TF_IDF(), 'tf-idf'
    elif rank_func == 3:
        weighting, rank_name = scoring.Frequency(), 'frequency'
    else:
        weighting, rank_name = scoring.BM25F(B=B, K1=K1), 'bm25'

    ix = open_dir(index_loc)
    with ix.searcher(weighting=weighting) as searcher:
        parser = MultifieldParser(['title', 'body', 'category', 'date', 'rating'],
                                  schema=ix.schema)
        for q_text in query_list:
            parsed = parser.parse(q_text)
            print('\n')
            print('--', q_text)
            results = searcher.search(parsed, limit=number)
            if len(results) == 0:
                print(' ')
                print('no matched result. please try again.')
            else:
                for hit in results:
                    print(' ')
                    print('#', hit.rank, rank_name, 'score:', round(hit.score, 10))
                    print('title:', hit['title'])
                    print('imdb:', hit['imdbid'], 'date:', hit['date'],
                          'rating:', hit['rating'], 'category:', hit['category'])
                    print('body:', hit['body'])
# Search a Chinese-analyzed Whoosh index with an optional proximity query.
# NOTE(review): the tail of this snippet ("# Allow larger fragments") suggests
# highlighting code follows beyond this view.
analyzer = chinese.ChineseAnalyzer()
# Only `content` is tokenized with the Chinese analyzer; other fields use defaults.
schema = Schema(title=TEXT(stored=True), sub_title=TEXT(stored=True), author=TEXT(stored=True), content=TEXT(stored=True, analyzer=analyzer))
storage = FileStorage("indexdir")
ix = storage.open_index()
# NOTE(review): this writer is never used or committed — it acquires and holds
# the index write lock for the life of the process; confirm it is needed.
writer = ix.writer()
string = "桐花 樂團"
normal = False  # False => run the proximity form of the query below
with ix.searcher() as searcher:
    # og = qparser.OrGroup.factory(0.9)
    parser = MultifieldParser(["title", "sub_title", "author", "content"], schema=ix.schema)
    # parser = qparser.QueryParser("content", ix.schema)
    # Swap the phrase plugin for the sequence plugin so quoted groups with
    # a trailing ~N slop are parsed as ordered near-queries.
    parser.remove_plugin_class(qparser.PhrasePlugin)
    parser.add_plugin(qparser.SequencePlugin())
    if (normal):
        query = parser.parse(string)
    else:
        # proximity: wrap the terms in quotes and append a slop of
        # (1 + distance) * 3 — presumably scaled x3 for CJK tokenization;
        # TODO confirm the factor.
        distance = 50
        proximty_query = "\"" + string + "\"" + '~' + str((1 + distance) * 3)
        query = parser.parse(proximty_query)
    print(query)
    results = searcher.search(query)
    # Allow larger fragments
# Query the 'nasdaq' Whoosh index for a natural-language question and print
# the single best-matching article title.
import whoosh.index
from whoosh.qparser import MultifieldParser, OrGroup, WildcardPlugin

whoosh_idx = whoosh.index.open_dir('whoosh_idx', indexname='nasdaq')
# OrGroup: match documents containing any of the query terms, ranked by score.
query_parser = MultifieldParser(['title', 'article'],
                                schema=whoosh_idx.schema, group=OrGroup)
# Plain-text questions contain '?' etc.; drop wildcard parsing so punctuation
# is not treated as a wildcard pattern.
query_parser.remove_plugin_class(WildcardPlugin)
parsed_query = query_parser.parse('What market does FitBit compete in?')
with whoosh_idx.searcher() as searcher:
    search_results = searcher.search(parsed_query, limit=1)
    # Fix: a list comprehension was used purely for its print side effects,
    # building a throwaway list of Nones — a plain loop is the correct idiom.
    for sr in search_results:
        print(sr['title'])