def searchByRank(entry):
    """Open a Tkinter window listing indexed mails whose 'date' field lies in
    the range given by *entry* (expected format: "<lower>-<upper>", e.g.
    "20100101-20101231")."""
    # NOTE(review): creates a second Tk root + mainloop; other windows in this
    # file use Tkinter.Toplevel() — confirm whether a Toplevel was intended.
    subtop1 = Tkinter.Tk()
    scrollbarY = Tkinter.Scrollbar(subtop1)
    scrollbarX = Tkinter.Scrollbar(subtop1, orient=Tkinter.HORIZONTAL)
    scrollbarY.pack(side=Tkinter.RIGHT, fill=Tkinter.Y)
    scrollbarX.pack(side=Tkinter.BOTTOM, fill=Tkinter.X)
    LB1 = Tkinter.Listbox(subtop1, width=80, height=12, yscrollcommand=scrollbarY.set, xscrollcommand=scrollbarX.set)
    scrollbarY.config(command=LB1.yview)
    scrollbarX.config(command=LB1.xview)
    # Split "lower-upper" into the two range bounds.
    fechas = entry.split("-")
    flinferior = fechas[0]
    flisuperior = fechas[1]
    # `ix` is a module-level whoosh index; build a textual range query on 'date'.
    qp = QueryParser('date', schema=ix.schema)
    q = qp.parse(u"date:[" + flinferior + " to " + flisuperior + "]")
    with ix.searcher() as s:
        results = s.search(q)
        i = 1    # 1-based mail counter
        pos = 1  # next Listbox row index
        for result in results:
            LB1.insert(pos, "Correo " + str(i))
            pos += 1
            LB1.insert(pos, "Remitentes: " + result['mailFrom'])
            pos += 1
            LB1.insert(pos, "Destinatarios: " + result['mailTo'])
            pos += 1
            LB1.insert(pos, "Subject: " + result['subject'])
            pos += 1
            LB1.insert(pos, "\n")
            pos += 1
            i += 1
        # i was advanced once per result, so i - 1 is the mail count.
        LB1.insert(pos, "En este rango de fechas se han enviado " + str(i - 1) + " correos.")
        pos += 1
    LB1.pack()
    subtop1.mainloop()
def get(self):
    """Flask-RESTful style GET handler: run a full-text search over the
    whoosh index and render the results page.

    Reads 'searchTerm' and 'searchScope' from the query string; scope is one
    of 'everything', 'wiki', 'jobs', 'projects'. Missing args render an
    empty search page.
    """
    wikiResults = None
    jobResults = None
    projectResults = None
    if 'searchScope' in request.args and 'searchTerm' in request.args:
        searchTerm = request.args.get('searchTerm')
        searchScope = request.args.get('searchScope')
        index = open_dir('app/search/index')
        parser = QueryParser("content", schema=index.schema)
        # Parse once and reuse: the original re-parsed the same term for
        # every scope branch.
        parsed = parser.parse(searchTerm)
        with index.searcher() as searcher:
            if searchScope in ['everything', 'wiki']:
                wikiResults = [{'title': result['title'],
                                'url': 'http://jhcwiki.jhc.co.uk/wiki/index.php/' + result['title'].replace(' ', '_')}
                               for result in searcher.search(parsed, limit=200)
                               if result['type'] == 'WIKI']
            if searchScope in ['everything', 'jobs']:
                jobResults = [{'title': result['title'], 'url': ''}
                              for result in searcher.search(parsed, limit=200)
                              if result['type'] == 'JOB']
            if searchScope in ['everything', 'projects']:
                projectResults = [{'title': result['title'],
                                   'url': url_for('projects.projectDetail', projectCode=result['title'].split('-')[0].strip())}
                                  for result in searcher.search(parsed, limit=200)
                                  if result['type'] == 'PROJECT']
    else:
        searchTerm = ''
        searchScope = 'everything'
    return render_template('search/search.html', wikiResults=wikiResults, jobResults=jobResults,
                           projectResults=projectResults, searchTerm=searchTerm, searchScope=searchScope,
                           title="Search")
def __call__(self, query):
    """Search the index for *query* and return the matching 'name' fields.

    The parsed query is AND-ed across the query's terms, where each term may
    match the parsed query itself, the 'description' or 'name' field, or any
    field listed in self.keywords (boolean operator words are excluded from
    term expansion).
    """
    query = unicode(query)
    query_parser = QueryParser("description", schema=self.ix.schema)
    myquery = query_parser.parse(query)
    # Don't treat boolean operator keywords as search terms.
    excluded = set(["AND", "OR", "NOT"])
    terms = [i for i in query.split() if i not in excluded]
    extendedquery = And(
        [
            Or(
                [myquery] +
                [Term("description", term), Term("name", term)] +
                [Term(field, term) for field in self.keywords]
            )
            for term in terms
        ]
    )
    # Fixed: the searcher was previously never closed (resource leak);
    # the list is fully materialized before the searcher goes away.
    with self.ix.searcher() as searcher:
        return [i["name"] for i in searcher.search(extendedquery, limit=None)]
def numeroDeCorreos2(var):
    """Look up a person by name in `ix2`, resolve their mail address, then
    list every mail addressed to them from `ix` in a new Tkinter window."""
    with ix.searcher() as searcher:
        # First resolve the name -> mail address via the contacts index ix2.
        query = QueryParser("name", ix2.schema)
        qp = query.parse(unicode(var))
        with ix2.searcher() as s:
            results = s.search(qp)
            # assumes at least one contact matches — IndexError otherwise; TODO confirm
            mail = results[0]["mail"]
        # Now search the mail index for messages sent TO that address.
        query = QueryParser("mailTo", ix.schema).parse(mail)
        results = searcher.search(query)
        panel = Tkinter.Toplevel()
        scrollbar = Tkinter.Scrollbar(panel)
        scrollbar.pack(side=Tkinter.RIGHT, fill=Tkinter.Y)
        listado = Tkinter.Text(panel, width=150, height=30, yscrollcommand=scrollbar.set)
        i = 1  # 1-based mail counter
        for result in results:
            listado.insert(Tkinter.INSERT, "Mail from: " + result["mailFrom"])
            listado.insert(Tkinter.INSERT, "Mail to: " + result["mailTo"])
            listado.insert(Tkinter.INSERT, "Subject: " + result["subject"])
            date = result["date"]
            # Stored date is "YYYYMMDD"; reformat as "YYYY-MM-DD".
            listado.insert(Tkinter.INSERT, "Date: " + date[:4] + "-" + date[4:6] + "-" + date[6:])
            listado.insert(Tkinter.INSERT, "Content: ")
            # Body lines were stored as a repr-like string of quoted chunks.
            content = re.findall("'([^']*)'", result["content"])
            last = content[-1]
            i += 1
            for line in content:
                # NOTE(review): identity test (`is not`) relies on re.findall
                # returning distinct string objects; `line is not last` strips
                # the trailing two chars from every chunk except the final one.
                if line is not last:
                    line = line[:-2]
                listado.insert(Tkinter.INSERT, line + "\n")
            listado.insert(Tkinter.INSERT, "\n")
        listado.insert(Tkinter.INSERT, "Este remitente ha enviado " + str(i - 1) + " correos.")
        scrollbar.config(command=listado.yview)
        listado.pack()
def test_wildcard_existing_terms():
    """existing_terms() returns nothing for multi-term queries unless
    expand=True, in which case it yields the matching index terms."""
    schema = fields.Schema(key=fields.ID, value=fields.TEXT)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    writer.add_document(key=u("a"), value=u("alfa bravo bear charlie delta"))
    writer.add_document(key=u("a"), value=u("boggle echo render rendering renders"))
    writer.commit()

    reader = ix.reader()
    qp = QueryParser("value", ix.schema)

    def words(terms):
        # Collect the term texts, checking every term is in the 'value' field.
        texts = []
        for fieldname, text in terms:
            assert fieldname == "value"
            texts.append(text)
        return " ".join(sorted(texts))

    q = qp.parse(u("b*"))
    assert_equal(q.existing_terms(reader), set())
    assert_equal(words(q.existing_terms(reader, expand=True)), "bear boggle bravo")

    q = qp.parse(u("[a TO f]"))
    assert_equal(q.existing_terms(reader), set())
    assert_equal(words(q.existing_terms(reader, expand=True)),
                 "alfa bear boggle bravo charlie delta echo")

    q = query.Variations("value", "render")
    assert_equal(q.existing_terms(reader, expand=False), set())
    assert_equal(words(q.existing_terms(reader, expand=True)),
                 "render rendering renders")
def finddocs(query, daterange=None, page=1, ndocs=PER_PAGE, MAX_SEARCH_RESULTS=MAX_SEARCH_RESULTS, distribution=True):
    """Search the content index for *query*, optionally restricted to a date range.

    Returns ``(docs, total_docs, daycount)`` when *distribution* is True
    (daycount maps each day to its number of hits), otherwise
    ``(docs, total_docs)``. *page*/*ndocs* select the page window of results.
    """
    ix = open_dir(indexdir)
    res = []
    daycount = {}
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        myquery = parser.parse(query)
        if distribution:
            myfacet = Facets().add_field("date", maptype=sorting.UnorderedList)
            if daterange is not None:
                datequery = DateRange("date", daterange[0], daterange[1])
                results = searcher.search(datequery & myquery, groupedby=myfacet, limit=MAX_SEARCH_RESULTS)
            else:
                results = searcher.search(myquery, groupedby=myfacet, limit=MAX_SEARCH_RESULTS)
            doc_cnt = 0
            # Page window [lo, hi): direct comparison replaces the original
            # `doc_cnt in range(lo, hi)`, an O(n) list-membership test per hit
            # on Python 2.
            lo = (page - 1) * ndocs
            hi = page * ndocs
            for day, docs in results.groups().iteritems():
                daycount[day] = len(docs)
                for result in docs:
                    if lo <= doc_cnt < hi:
                        # Fetch the stored fields once per hit (was 3 lookups).
                        sf = searcher.stored_fields(result)
                        res.append({'title': sf['title'],
                                    'identifier': sf['identifier'],
                                    'date': sf['date']})
                    doc_cnt += 1
            total_docs = results.estimated_length()
            return res, total_docs, daycount
        else:
            if daterange is not None:
                datequery = DateRange("date", daterange[0], daterange[1])
                results = searcher.search(datequery & myquery, limit=MAX_SEARCH_RESULTS)
            else:
                results = searcher.search(myquery, limit=MAX_SEARCH_RESULTS)
            for result in results[(page - 1) * ndocs:page * ndocs]:
                res.append({'title': result['title'],
                            'identifier': result['identifier'],
                            'date': result['date']})
            total_docs = results.estimated_length()
            return res, total_docs
def searchNote(self):
    """ Sorting criteria: "title > path > content"
        Search matches are organized into html source.
    """
    pattern = self.searchEdit.text()
    if not pattern:
        return
    results = []
    print("Searching using", pattern)
    with self.ix.searcher() as searcher:
        matches = []
        # Query each field in priority order; duplicates across fields are
        # dropped, so the first (highest-priority) field wins.
        for f in ["title", "path", "content"]:
            queryp = QueryParser(f, self.ix.schema)
            queryp.add_plugin(RegexPlugin())
            # r"pattern" is the desired regex term format
            # NOTE(review): pattern is interpolated unescaped — a pattern
            # containing a double quote breaks the query string; confirm.
            query = queryp.parse('r"' + pattern + '"')
            ms = searcher.search(query, limit=None)  # default limit is 10!
            for m in ms:
                if not m in matches:
                    matches.append(m)
        for r in matches:
            title = r['title']
            path = r['path']
            term = r.highlights("content")
            results.append([title, path, term])
    # Render the collected matches as simple HTML links with highlights.
    html = ""
    for title, path, hi in results:
        html += ("<p><a href='" + path + "'>" + title +
                 "</a><br/><span class='path'>" + path +
                 "</span><br/>" + hi + "</p>")
    self.searchView.setHtml(html)
    print("Finished searching", pattern)
def get_answer(message):
    """Answer messages of the form "jova, <terms>": search the index for the
    terms and return ``(path, 'plain-text')`` for one matching document.

    Returns None for commands (leading '/'), empty messages, non-matching
    messages, or when no indexed document matches.
    """
    # Fixed: the original indexed message[0] unguarded, raising IndexError
    # on an empty message.
    if not message or '/' in message[0]:
        return None
    m = re.match(r'jova,?\s(.+)$', message)
    if not m:
        return None
    global ix
    search_terms = m.group(1)
    parser = QueryParser("content", ix.schema)
    qry = parser.parse(search_terms)
    with ix.searcher() as searcher:
        results = searcher.search(qry)
        if len(results) == 0:
            return None
        # Single hit: return it; multiple hits: pick one at random.
        if len(results) == 1:
            result = results[0]
        else:
            result = random.choice(results)
        if result is None or 'path' not in result:
            return None
        return result['path'], 'plain-text'
def findsnippets(query, daterange=None, page=1, ndocs=PER_PAGE, MAX_SEARCH_RESULTS=MAX_SEARCH_RESULTS, distribution=True):
    """Like finddocs(), but each returned doc also carries a highlighted
    'snippet' re-extracted from the original document text via PoorDoc.

    Returns (docs, total, daycount) when *distribution* is True, else
    (docs, total).
    """
    ix = open_dir(indexdir)
    res = []
    daycount = {}
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        myquery = parser.parse(query)
        if daterange != None:
            datequery = DateRange("date", daterange[0], daterange[1])
            results = searcher.search(datequery & myquery, limit=MAX_SEARCH_RESULTS)
        else:
            results = searcher.search(myquery, limit=MAX_SEARCH_RESULTS)
        if distribution:
            # Second, faceted search solely to obtain per-day hit counts.
            myfacet = Facets().add_field("date", maptype=sorting.Count)
            if daterange != None:
                datequery = DateRange("date", daterange[0], daterange[1])
                daycount_orig = searcher.search(datequery & myquery, groupedby=myfacet, limit=MAX_SEARCH_RESULTS)
            else:
                daycount_orig = searcher.search(myquery, groupedby=myfacet, limit=MAX_SEARCH_RESULTS)
            for day in daycount_orig.groups():
                daycount[day] = daycount_orig.groups()[day]
            for result in results[(page - 1) * ndocs:page * ndocs]:
                # Re-load the raw document text so highlights() can fragment it.
                doc = PoorDoc(docidentifier=result['identifier'], date=int(result['date'].strftime("%Y%m%d")))
                snippet = result.highlights("content", text=doc.getcontent())
                res.append({'title': result['title'], 'identifier': result['identifier'], 'date': result['date'], 'snippet': snippet})
            total_docs = results.estimated_length()
            return res, total_docs, daycount
        else:
            # Same paging/snippet loop, without the day distribution.
            for result in results[(page - 1) * ndocs:page * ndocs]:
                doc = PoorDoc(docidentifier=result['identifier'], date=int(result['date'].strftime("%Y%m%d")))
                snippet = result.highlights("content", text=doc.getcontent())
                res.append({'title': result['title'], 'identifier': result['identifier'], 'date': result['date'], 'snippet': snippet})
            total_docs = results.estimated_length()
            return res, total_docs
def test_correct_query():
    """correct_query() replaces misspelled terms in spelling-enabled field 'a'
    while leaving phrases and terms in other fields ('b') untouched."""
    schema = fields.Schema(a=fields.TEXT(spelling=True), b=fields.TEXT)
    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(a=u("alfa bravo charlie delta"))
    w.add_document(a=u("delta echo foxtrot golf"))
    w.add_document(a=u("golf hotel india juliet"))
    w.add_document(a=u("juliet kilo lima mike"))
    w.commit()
    s = ix.searcher()
    qp = QueryParser("a", ix.schema)
    qtext = u('alpha ("brovo november" OR b:dolta) detail')
    # NOTE(review): parse()'s second positional parameter is `normalize`, not
    # a schema — passing ix.schema is merely truthy here; confirm intent.
    q = qp.parse(qtext, ix.schema)
    c = s.correct_query(q, qtext)
    # 'alpha'->'alfa', phrase word 'brovo'->'bravo'; b:dolta is not corrected.
    assert c.query.__unicode__() == '(a:alfa AND (a:"bravo november" OR b:dolta) AND a:detail)'
    assert c.string == 'alfa ("bravo november" OR b:dolta) detail'
    qtext = u('alpha b:("brovo november" a:delta) detail')
    q = qp.parse(qtext, ix.schema)
    c = s.correct_query(q, qtext)
    # Inside the b:(...) group only a-field terms are corrected.
    assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
    assert c.string == 'alfa b:("brovo november" a:delta) detail'
    hf = highlight.HtmlFormatter(classname="c")
    # format_string() wraps only the corrected token.
    assert c.format_string(hf) == '<strong class="c term0">alfa</strong> b:("brovo november" a:delta) detail'
def populateTable(self, searchterm=None): self.infoTable.clear() self.infoTable.setHorizontalHeaderLabels( [ "Title", "Authors", "Tags", "Year", "Read" ] ) self.infoTable.setRowCount(0) self.infoTable.horizontalHeader().setResizeMode(0, QHeaderView.Stretch) self.infoTable.verticalHeader().hide() if( searchterm == None or searchterm == "" ): papers = KKDocument.objects.all() for p in papers: a = ', '.join([x.name for x in p.authors.all()]) t = ', '.join([x.tag for x in p.tags.all()]) self.newEntry(p.title, a, t, p.year, p) return # were done here - all papers printed # only if there is a searchterm: # search full text with whoosh print "FINDING %s" % searchterm searcher = self.whoosh_ix.searcher() parser = QueryParser("content", schema = self.whoosh_schema) query = parser.parse(unicode(searchterm)) whoosh_results = searcher.search(query) print "FOUND", len(whoosh_results), "Objects" for r in whoosh_results: p = KKDocument.objects.get(localFile=r['path']) a = ', '.join([x.name for x in p.authors.all()]) t = ', '.join([x.tag for x in p.tags.all()]) self.newEntry(p.title, a, t, p.year, p)
def grid_search(rookie_avg, surround, fragment_char_limit, whoosh_results, corpus, query_string): ''' find best top parameter for whoosh snips. top parameter controls how many ... delimited fragments best = minimize distance w/ average size of rookie snip ''' best = {""} best_distance_so_far = 1000000000 index = open_dir("indexes/{}/".format(corpus)) corpusid = getcorpusid(corpus) for top in range(1, 5): # basically fixed for now with index.searcher() as srch: query_parser = QueryParser("content", schema=index.schema) qry = query_parser.parse(query_string) results = srch.search(qry, limit=None) results.fragmenter.surround = surround results.fragmenter.maxchars = fragment_char_limit sum_ = 0 for s_ix, a in enumerate(results): path = a.get("path").replace("/", "") sents = get_preproc_sentences(path, corpusid) sss = unicode(" ".join(sents).encode("ascii", "ignore")) sss = str(a.highlights("content", text=sss, top=top)) sum_ += len(sss) diff = abs(rookie_avg - sum_/len(results)) print "top of {} gives diff of {}".format(top, diff) if diff < best_distance_so_far: best = top best_distance_so_far = diff print "best top = {}".format(best) return best
def crearEsquemaCorreo():
    """Create the 'indexCorreo' whoosh index with the mail schema, add one
    sample document, then run a sanity search on the sender field and print
    the first hit."""
    correo1 = "1.txt"  # NOTE(review): unused local — confirm it can be removed
    correoEsquema = Schema(remitente=ID(stored=True), destinatarios=KEYWORD(stored=True),
                           fecha=DATETIME(stored=True), asunto=KEYWORD(stored=True),
                           cuerpo=TEXT(stored=True))
    if not os.path.exists("indexCorreo"):
        os.mkdir("indexCorreo")
    # Create then immediately re-open the index directory.
    iC = index.create_in("indexCorreo", correoEsquema)
    iC = open_dir("indexCorreo")
    writer = iC.writer()
    fecha = "20101015"
    date_email = datetime.strptime(fecha, "%Y%m%d")
    writer.add_document(remitente=u"unoarrobagmail.com",
                        destinatarios=u"dosarrobagmail.com tresarrobagmail.com",
                        fecha=date_email, asunto=u"Contrato de compraventa con la constructora",
                        cuerpo=u"Estimados socios: ya hemos firmado el contrato de compraventa con el cliente preferencial. Espero noticias vuestras. Un saludo,")
    #writer.add_document(email=u"dosarrobagmail.com", name=u"Pedro Guerra")
    #writer.add_document(email=u"tresarrobagmail.com", name=u"Ana Montero")
    #writer.add_document(email=u"cuatroarrobagmail.com", name=u"Luis Pontes")
    writer.commit()
    # Sanity check: query back the document just indexed.
    qp = QueryParser("remitente", schema=iC.schema)
    q = qp.parse(u"unoarrobagmail.com")
    with iC.searcher() as s:
        results = s.search(q)
        print results[0]
def search_datasets(self, search_phrase, limit=None):
    """Search for just the datasets.

    Runs *search_phrase* against the dataset index and aggregates hits by
    bundle vid ('bvid'): bundle hits ('b') mark the bundle found and add to
    b_score; any other hit type adds to p_score and records the partition vid.
    Returns a dict mapping bvid -> SearchResult.
    """
    from collections import defaultdict
    from whoosh.qparser import QueryParser

    parser = QueryParser("doc", schema=self.dataset_index.schema)
    query = parser.parse(search_phrase)

    datasets = defaultdict(SearchResult)

    with self.dataset_index.searcher() as searcher:
        results = searcher.search(query, limit=limit)
        for hit in results:
            vid = hit.get('vid')
            bvid = hit.get('bvid')
            # Renamed from `type`, which shadowed the builtin.
            hit_type = hit.get('type')
            datasets[bvid].vid = bvid
            if hit_type == 'b':
                datasets[bvid].bundle_found = True
                datasets[bvid].b_score += hit.score
            else:
                datasets[bvid].p_score += hit.score
                datasets[bvid].partitions.add(vid)
    return datasets
def OnlyOneSearch(queryStr="", index=".index"):
    """Run *queryStr* against the 'name' field of the index at *index* and
    return the whoosh Results (searcher stays open so results remain usable)."""
    idx = get_index(index)
    parser = QueryParser("name", schema=idx.schema)
    parsed = parser.parse(queryStr)
    return idx.searcher().search(parsed)
def stage3():
    """Interactive console search loop: read phrases from stdin and print
    url/title/company for every matching document until Ctrl-C."""
    ix = open_dir(index_directory)
    # NOTE(review): open_dir raises on a missing index rather than returning
    # a falsy value, so this guard may never trigger — confirm.
    if not ix:
        print "No index"
        return
    parser = QueryParser("content", ix.schema)
    with ix.searcher() as searcher:
        try:
            while True:
                search_phrase = raw_input('Search phrase: ')
                if not search_phrase:
                    continue
                # Decode console bytes so whoosh receives unicode.
                search_phrase = search_phrase.decode(sys.stdin.encoding)
                myquery = parser.parse(search_phrase)
                results = searcher.search(myquery)
                if results:
                    for result in results:
                        print "%s - %s (%s)" % (result['url'], result['title'], result['company'])
                else:
                    print "No matching results"
                print "\r\n"
        except KeyboardInterrupt:
            # Ctrl-C exits the loop cleanly.
            print "\nBae..."
            return
def index_query(environ, **kwargs): """ Return a generator of tiddlers that match the provided arguments. """ logging.debug('entering with %s', environ) print 'getting called on index_query' config = environ['tiddlyweb.config'] #store = environ['tiddlyweb.store'] query_parts = [] for field, value in kwargs.items(): if field == 'tag': field = 'tags' query_parts.append('%s:%s' % (field, value)) query_string = ' '.join(query_parts) print 'getting inside on index_query' schema = config.get('wsearch.schema', SEARCH_DEFAULTS['wsearch.schema']) searcher = get_searcher(config) parser = QueryParser('text', schema=Schema(**schema)) query = parser.parse(query_string) logging.debug('query parsed to %s' % query) results = searcher.search(query) def tiddler_from_result(result): print 'r', result bag, title = result['id'].split(':', 1) tiddler = Tiddler(title, bag) return tiddler #return store.get(tiddler) for result in results: yield tiddler_from_result(result) return
def search(q, default_field="content"):
    """Parse *q* against *default_field* of the index at SEARCH_INDEX and
    return the whoosh Results (searcher left open so results stay usable)."""
    idx = index.open_dir(SEARCH_INDEX)
    qp = QueryParser(default_field, schema=idx.schema)
    return idx.searcher().search(qp.parse(q))
def search(request):
    """Django view: full-text search over the per-newspaper whoosh index
    selected by the 'newspaper' GET parameter; renders search.html with
    Result objects (title, url, date, highlights, key terms)."""
    hits = []
    results = []
    query = request.GET.get('q', None)
    newspaper = request.GET.get('newspaper', None)
    if newspaper is not None:
        index_dir = "C:/Django Projects/searcher/modules/index" + newspaper
        ix = index.open_dir(index_dir)
        searcher = ix.searcher()
        if query is not None and query != u"":
            # Translate the form's +/- operators into whoosh boolean syntax.
            query = query.replace('+', ' AND ').replace(' -', ' NOT ')
            parser = QueryParser("content", schema=ix.schema)
            try:
                qry = parser.parse(query)
            # Fixed: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Unparseable input -> no results.
            except Exception:
                qry = None
            if qry is not None:
                hits = searcher.search(qry)
                for hit in hits:
                    title = hit['title']
                    url = hit['url']
                    date = hit['date']
                    highlights = hit.highlights("content")
                    # Extract the statistically significant terms of this hit.
                    keywords_list = [keyword for keyword, score in searcher.key_terms_from_text("content", hit['content'])]
                    keywords = ", ".join(keywords_list)
                    results.append(Result(title, url, date, highlights, keywords))
    variables = RequestContext(request, {
        'query': query,
        'hits': results
    })
    return render_to_response('search.html', variables)
def test_correct_spell_field():
    """With a stemming analyzer + spelling=True, the main field holds stems
    while the parallel 'spell_text' field holds the original words, and
    correct_query() suggests from the unstemmed forms."""
    ana = analysis.StemmingAnalyzer()
    schema = fields.Schema(text=fields.TEXT(analyzer=ana, spelling=True))
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(text=u"rendering shading modeling reactions")
        with ix.searcher() as s:
            text = s.schema["text"]
            spell_text = s.schema["spell_text"]
            r = s.reader()
            # The indexed field contains stems...
            words = [text.from_bytes(t) for t in r.lexicon("text")]
            assert words == ["model", "reaction", "render", "shade"]
            # ...while the spelling field keeps the surface forms.
            words = [spell_text.from_bytes(t) for t in r.lexicon("spell_text")]
            assert words == ["modeling", "reactions", "rendering", "shading"]
            qp = QueryParser("text", s.schema)
            qtext = u"renderink"
            # NOTE(review): parse()'s second positional parameter is
            # `normalize`; passing s.schema is merely truthy — confirm intent.
            q = qp.parse(qtext, s.schema)
            # The misspelling matches nothing...
            r = s.search(q)
            assert len(r) == 0
            # ...but correction recovers the unstemmed word.
            c = s.correct_query(q, qtext)
            assert c.string == "rendering"
            assert c.query == query.Term("text", "rendering")
            hf = highlight.HtmlFormatter(classname="c")
            assert c.format_string(hf) == '<strong class="c term0">rendering</strong>'
def search_files(index_dir, content):
    """
    search file content in index
    if not hit: return False
    if hit: return results
    """
    index_exist = index.exists_in(index_dir)
    if not index_exist:
        print ("index not exist")
        return False
    ix = index.open_dir(index_dir)
    # whoosh expects unicode query text (Python 2).
    content = unicode(content)
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        query = parser.parse(content)
        # whoosh.searching.Results
        results = searcher.search(query)
        print (type(results))
        l = len(results)
        print l
        for h in results:
            # whoosh.searching.Hit
            print type(h)
            print h
        # NOTE(review): returning exits the `with`, closing the searcher —
        # the lazy Results object may be unusable to the caller; confirm.
        return results
    # NOTE(review): unreachable — the with-block above always returns.
    return False
def test_all_terms():
    """all_terms() omits phrase words unless phrases=True."""
    parsed = QueryParser("a", None).parse(u('hello b:there c:"my friend"'))

    without_phrases = parsed.all_terms(phrases=False)
    assert sorted(without_phrases) == [("a", "hello"), ("b", "there")]

    with_phrases = parsed.all_terms(phrases=True)
    assert sorted(with_phrases) == [("a", "hello"), ("b", "there"),
                                    ("c", "friend"), ("c", "my")]
def whooshSearch(searchTerm, brandTerm):
    """Search the food index by tag, filtered by brand source.

    An empty brand means: take up to 7 'general' hits first, then up to 1000
    non-general hits. A named brand returns up to 1000 hits from that source
    only. Returns the list of matching 'id' values.
    """
    brandTerm = brandTerm.lower()
    # Open index
    ix = index.open_dir("index")
    # Parse the term against the 'tag' field.
    tag_parser = QueryParser("tag", ix.schema)
    tag_query = tag_parser.parse(searchTerm)
    searcher = ix.searcher()

    if brandTerm == "":
        brandTerm = "general"
        source_filter = Term("source", brandTerm)
        # General hits first (filter), then everything else (mask).
        general_hits = searcher.search(tag_query, filter=source_filter, limit=7)
        other_hits = searcher.search(tag_query, mask=source_filter, limit=1000)
        matches = list(general_hits) + list(other_hits)
    else:
        source_filter = Term("source", brandTerm)
        matches = searcher.search(tag_query, filter=source_filter, limit=1000)

    ids = [hit["id"] for hit in matches]
    searcher.close()
    return ids
def whoosh(q_query, raw_query, page):
    """Search the internal whoosh index for the exact phrase *raw_query* and
    return result dicts shaped for the results renderer; any failure is
    converted into a single 'error' result entry."""
    results = []
    try:
        import internal_search
        with internal_search.ix.searcher() as searcher:
            from whoosh.qparser import QueryParser
            parser = QueryParser("keywords", internal_search.ix.schema)
            # Quote the raw query so it is matched as a phrase.
            myquery = parser.parse('"%s"' % raw_query)
            w_results = searcher.search(myquery)
            for r in w_results:
                s = ''
                if 'official' in r:
                    s = ' official'
                # 'twitter' is an optional stored field.
                if not 'twitter' in r:
                    t = ''
                else:
                    t = r['twitter']
                results.append({
                    "style": "internal_search%s" % s,
                    "title": r['title'],
                    "url": r['link'],
                    "snippet": r['content'],
                    "display_url": r['link'],
                    "twitter": t
                })
    except Exception as ex:
        import traceback
        # NOTE(review): `debug_info` is not defined in this function — unless
        # it is a module-level global, this handler itself raises NameError.
        results.append({
            "style": "error",
            "title": "An error occured: %s" % repr(ex) + "<br/>" + traceback.format_exc().replace("\n", "<br/>"),
            "snippet": "<pre>%s</pre>" % debug_info
        })
    return results
def lookup(self, key, raw=True, field="entity_id"):
    """Look up stored objects by a '+'-separated key expression.

    Special case: key None or 'entities' returns everything. Otherwise the
    key is rewritten into whoosh query syntax ('+' -> AND, attribute URIs ->
    short names, '='/'{uri}' forms -> 'field:value') and searched. Returns
    raw objects (raw=True) or their info dicts.
    """
    if key == 'entities' or key is None:
        if raw:
            return b2u(list(self.objects.values()))
        else:
            return b2u(list(self.infos.values()))

    from whoosh.qparser import QueryParser
    # import pdb; pdb.set_trace()
    key = key.strip('+')
    key = key.replace('+', ' AND ')
    # Replace full attribute URIs with their short aliases.
    for uri, a in list(ATTRS_INV.items()):
        key = key.replace(uri, a)
    # Pad so the regexes below can anchor on surrounding whitespace.
    key = " {!s} ".format(key)
    # "attr=value"  -> "attr:value"
    key = re.sub("([^=]+)=(\S+)", "\\1:\\2", key)
    # "{uri}value"  -> "uri:value"
    key = re.sub("{([^}]+)}(\S+)", "\\1:\\2", key)
    key = key.strip()

    qp = QueryParser("object_id", schema=self.schema)
    q = qp.parse(key)
    lst = set()
    with self.index.searcher() as searcher:
        results = searcher.search(q, limit=None)
        for result in results:
            if raw:
                lst.add(self.objects[result['object_id']])
            else:
                lst.add(self.infos[result['object_id']])
    return b2u(list(lst))
def fulltext_search(request, query, limit):
    """Full-text search over the 'value' field, returning instances filtered
    by workflow state and by who is asking (authenticated users see drafts;
    a submitter sees their own drafts; everyone sees published items)."""
    idx = get_index()
    parser = QueryParser('value', idx.schema)
    with idx.searcher() as searcher:
        parsed = parser.parse(query)
        hits = searcher.search(parsed, limit=limit)
        instances = results_to_instances(request, hits)

    def limited(res):
        # Re-apply the caller's limit after state filtering.
        return res if limit is None else res[:limit]

    # Authenticated users get drafts as well as published results.
    if authenticated_userid(request):
        return limited(
            [inst for inst in instances if inst.state in ['draft', 'published']])

    # A known submitter additionally sees their own drafts.
    submitter = get_submitter(request)
    if submitter:
        return limited(
            [inst for inst in instances
             if inst.state == 'published'
             or (inst.state == 'draft' and submitter == inst.submitter)])

    # Anonymous callers: published only.
    return limited([inst for inst in instances if inst.state == 'published'])
def search(self, q, page=1, size=30):
    """Run an OR-grouped query over the 'content' field and return the parsed
    results for the requested page (*size* hits per page)."""
    parser = QueryParser("content", schema=self.ix.schema, group=OrGroup)
    parsed = parser.parse(q)
    searcher = self.ix.searcher()
    page_results = searcher.search_page(parsed, page, size)
    return self.parse_results(page_results)
def searchNote(self):
    """Regex-search note contents and render the hits (title, path,
    highlighted snippet) as HTML in the search view, sorted by path then
    relevance score."""
    pattern = self.searchEdit.text()
    qres = []
    with self.ix.searcher() as searcher:
        queryp = QueryParser("content", self.ix.schema)
        queryp.add_plugin(RegexPlugin())
        # r"pattern" is the desired regex term format
        # NOTE(review): pattern is interpolated unescaped — a pattern with a
        # double quote breaks the query string; confirm.
        query = queryp.parse('r"' + pattern + '"')
        pathFacet = sorting.FieldFacet("path")
        scores = sorting.ScoreFacet()
        results = searcher.search(
            query, limit=None, sortedby=[pathFacet, scores])  # default limit is 10!
        for r in results:
            listItem = QListWidgetItem()  # NOTE(review): unused local — confirm
            title = r['title']
            text = r['path']
            term = r.highlights("content")
            qres.append([title, text, term])
    # Inline stylesheet for the rendered hit list.
    html = """ <style> body { font-size: 14px; } .path { font-size: 12px; color: #009933; } </style> """
    for ti, te, hi in qres:
        html += ("<p><a href='" + te + "'>" + ti +
                 "</a><br/><span class='path'>" + te +
                 "</span><br/>" + hi + "</p>")
    self.searchView.setHtml(html)
def __init__(self, search_term):
    """Build a SQL query of tourney-list ids matching *search_term*.

    The term is parsed with whoosh's QueryParser (schema-less) purely to get
    a boolean expression tree, which tree_to_expr() then converts into a
    SQLAlchemy filter over an aggregated subquery of tourney lists.
    """
    # Normalize user-typed and/or connectives to uppercase operators.
    search_term = re.sub(and_regex, ' AND ', search_term)
    search_term = re.sub(or_regex, ' OR ', search_term)
    # schema=None: we only want the parsed boolean tree, not a real search.
    parser = QueryParser("content", schema=None)
    q = parser.parse(search_term)
    invalid = self.validate_search_term(q)
    if invalid:
        raise ValueError(invalid + search_term)
    myapp.db_connector.connect()
    session = myapp.db_connector.get_session()
    # One row per tourney list, with concatenated ship/pilot/upgrade names
    # so the text expression can be matched against the aggregates.
    subq = session.query(
        TourneyList.id.label("tourney_list_id"),
        TourneyVenue.country.label("country_name"),
        TourneyVenue.state.label("state_name"),
        TourneyVenue.city.label("city_name"),
        TourneyVenue.venue.label("venue_name"),
        Tourney.tourney_type.label("tourney_type"),
        func.group_concat(ShipPilot.ship_type.distinct()).label("ship_name"),
        func.group_concat(func.concat(Pilot.name, " ", Pilot.canon_name)).label("pilot_name"),
        func.group_concat(func.concat(Upgrade.name, " ", Upgrade.canon_name)).label("upgrade_name")). \
        join(Tourney).\
        join(TourneyVenue).\
        join(Ship). \
        join(ShipPilot). \
        join(Pilot). \
        outerjoin(ShipUpgrade). \
        outerjoin(Upgrade).\
        group_by(TourneyList.id).subquery()
    # Convert the whoosh expression tree into a SQLAlchemy boolean filter.
    fn = tree_to_expr(q, subq)
    self.query = session.query(subq.c.tourney_list_id).filter(fn)
def test_correct_query():
    """correct_query() fixes misspelled terms in field 'a' (including phrase
    words) while leaving b-field terms untouched."""
    schema = fields.Schema(a=fields.TEXT(), b=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(a=u"alfa bravo charlie delta")
            w.add_document(a=u"delta echo foxtrot golf")
            w.add_document(a=u"golf hotel india juliet")
            w.add_document(a=u"juliet kilo lima mike")
        with ix.searcher() as s:
            qp = QueryParser("a", ix.schema)
            qtext = u'alpha ("brovo november" OR b:dolta) detail'
            # NOTE(review): parse()'s second positional parameter is
            # `normalize`; passing ix.schema is merely truthy — confirm.
            q = qp.parse(qtext, ix.schema)
            c = s.correct_query(q, qtext)
            cq = c.query
            # Structure preserved: And(term, Or(phrase, b-term), term) with
            # 'alpha'->'alfa' and phrase word 'brovo'->'bravo' corrected.
            assert isinstance(cq, query.And)
            assert cq[0].text == "alfa"
            assert isinstance(cq[1], query.Or)
            assert isinstance(cq[1][0], query.Phrase)
            assert cq[1][0].words == ["bravo", "november"]
            qtext = u'alpha b:("brovo november" a:delta) detail'
            q = qp.parse(qtext, ix.schema)
            c = s.correct_query(q, qtext)
            # b-field phrase is not corrected; a-field terms are.
            assert c.query.__unicode__() == '(a:alfa AND b:"brovo november" AND a:delta AND a:detail)'
            assert c.string == 'alfa b:("brovo november" a:delta) detail'
            hf = highlight.HtmlFormatter(classname="c")
            # Only the corrected token is wrapped by the formatter.
            assert c.format_string(hf) == '<strong class="c term0">alfa</strong> b:("brovo november" a:delta) detail'
Quick Start

Whoosh是一个索引文本和搜索文本的类库，它可以为你提供搜索文本的服务。比如，如果你在创建一个博客软件，你可以用Whoosh为它添加一个搜索功能，以便用户来搜索博客的入口。

下面是一个简短的例子:

from whoosh.index import create_in
from whoosh.fields import *
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
ix = create_in("/home/gswewf/百科/indexer", schema)  # (这里的"indexer"实际上是一个目录，因此按照这个步骤来会出错，你得先创建目录，译者注)
writer = ix.writer()
writer.add_document(title=u"First document", path=u"/a", content=u"this is the first document we've add!")
writer.add_document(title=u"Second document", path=u"/b", content=u"The second one is even more interesting!")
writer.commit()

from whoosh.qparser import QueryParser
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("first")
    results = searcher.search(query)
    results[0]
    # {"title": u"First document", "path": u"/a"}

Index和Schema对象

在开始使用Whoosh之前，你需要一个Index对象。在你第一次创建Index对象时，你必须定义一个Schema对象，Schema对象列出了Index的所有域。一个域就是Index对象里面每个document的一个信息，比如它的题目或者它的内容。一个域能够被索引（就是能被搜索到）或者被存储（就是得到索引之后的结果，这对于标题之类的索引非常有用）。

下面这个Schema对象有两个域，"title"和"content":

from whoosh.fields import Schema, TEXT
schema = Schema(title=TEXT, content=TEXT)

当你创建Index的时候，你创建一次Schema对象就够了。Schema是序列化的，并且和Index一起存储。
class WhooshSearchBackend(BaseSearchBackend):
    """django-haystack backend backed by Whoosh, using a Chinese analyzer
    for text fields.

    NOTE(review): relies on module-level names imported at the top of this
    file (FileStorage, RamStorage, QueryParser, ChineseAnalyzer, the
    haystack helpers, the ``whoosh.index`` module as ``index``, etc.).
    """

    # Words reserved by Whoosh for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Whoosh for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')',
        '{', '}', '[', ']', '^', '"', '~', '*', '?', ':', '.',
    )

    def __init__(self, connection_alias, **connection_options):
        super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
        self.setup_complete = False
        self.use_file_storage = True
        # FIX: ``connection_options`` is a dict, so the original
        # ``getattr(connection_options, 'POST_LIMIT', ...)`` could never find
        # the key and always returned the default. Use dict.get() instead.
        self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
        self.path = connection_options.get('PATH')

        if connection_options.get('STORAGE', 'file') != 'file':
            self.use_file_storage = False

        if self.use_file_storage and not self.path:
            raise ImproperlyConfigured(
                "You must specify a 'PATH' in your settings for connection '%s'."
                % connection_alias)

        self.log = logging.getLogger('haystack')

    def setup(self):
        """
        Defers loading until needed.
        """
        from haystack import connections
        new_index = False

        # Make sure the index is there.
        if self.use_file_storage and not os.path.exists(self.path):
            os.makedirs(self.path)
            new_index = True

        if self.use_file_storage and not os.access(self.path, os.W_OK):
            raise IOError(
                "The path to your Whoosh index '%s' is not writable for the current user/group."
                % self.path)

        if self.use_file_storage:
            self.storage = FileStorage(self.path)
        else:
            global LOCALS

            if getattr(LOCALS, 'RAM_STORE', None) is None:
                LOCALS.RAM_STORE = RamStorage()

            self.storage = LOCALS.RAM_STORE

        self.content_field_name, self.schema = self.build_schema(
            connections[self.connection_alias].get_unified_index().all_searchfields())
        self.parser = QueryParser(self.content_field_name, schema=self.schema)

        if new_index is True:
            self.index = self.storage.create_index(self.schema)
        else:
            try:
                self.index = self.storage.open_index(schema=self.schema)
            except index.EmptyIndexError:
                self.index = self.storage.create_index(self.schema)

        self.setup_complete = True

    def build_schema(self, fields):
        """Map haystack field objects to a Whoosh ``Schema``.

        Returns a ``(content_field_name, Schema)`` tuple, where
        ``content_field_name`` is the name of the document field.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        for field_name, field_class in fields.items():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(
                        stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(
                        stored=True, commas=True, scorable=True,
                        field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(
                    stored=field_class.stored, sortable=True)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored, numtype=int,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(
                    stored=field_class.stored, numtype=float,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(
                    stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(
                    minsize=3, maxsize=15, stored=field_class.stored,
                    field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(
                    minsize=2, maxsize=15, at='start', stored=field_class.stored,
                    field_boost=field_class.boost)
            else:
                # Default text field: analyzed with the Chinese analyzer.
                schema_fields[field_class.index_fieldname] = TEXT(
                    stored=True, analyzer=ChineseAnalyzer(),
                    field_boost=field_class.boost, sortable=True)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname
                schema_fields[field_class.index_fieldname].spelling = True

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError(
                "No fields were found in any search_indexes. Please correct this before attempting to search."
            )

        return (content_field_name, Schema(**schema_fields))

    def update(self, index, iterable, commit=True):
        """Add/refresh ``iterable``'s objects in the Whoosh index."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        writer = AsyncWriter(self.index)

        for obj in iterable:
            try:
                doc = index.full_prepare(obj)
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            else:
                # Really make sure it's unicode, because Whoosh won't have it any
                # other way.
                for key in doc:
                    doc[key] = self._from_python(doc[key])

                # Document boosts aren't supported in Whoosh 2.5.0+.
                if 'boost' in doc:
                    del doc['boost']

                try:
                    writer.update_document(**doc)
                except Exception as e:
                    if not self.silently_fail:
                        raise

                    # We'll log the object identifier but won't include the actual object
                    # to avoid the possibility of that generating encoding errors while
                    # processing the log message:
                    self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                                   exc_info=True,
                                   extra={"data": {"index": index, "object": get_identifier(obj)}})

        if len(iterable) > 0:
            # For now, commit no matter what, as we run into locking issues otherwise.
            writer.commit()

    def remove(self, obj_or_string, commit=True):
        """Delete the document matching ``obj_or_string``'s identifier."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        whoosh_id = get_identifier(obj_or_string)

        try:
            self.index.delete_by_query(q=self.parser.parse(u'%s:"%s"' % (ID, whoosh_id)))
        except Exception as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Whoosh: %s",
                           whoosh_id, e, exc_info=True)

    def clear(self, models=None, commit=True):
        """Wipe the whole index, or only the documents of ``models``."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()

        if models is not None:
            assert isinstance(models, (list, tuple))

        try:
            if models is None:
                self.delete_index()
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append(u"%s:%s" % (DJANGO_CT, get_model_ct(model)))

                self.index.delete_by_query(
                    q=self.parser.parse(u" OR ".join(models_to_delete)))
        except Exception as e:
            if not self.silently_fail:
                raise

            if models is not None:
                self.log.error("Failed to clear Whoosh index of models '%s': %s",
                               ','.join(models_to_delete), e, exc_info=True)
            else:
                self.log.error("Failed to clear Whoosh index: %s", e, exc_info=True)

    def delete_index(self):
        """Destroy the whole index and rebuild an empty one."""
        # Per the Whoosh mailing list, if wiping out everything from the index,
        # it's much more efficient to simply delete the index files.
        if self.use_file_storage and os.path.exists(self.path):
            shutil.rmtree(self.path)
        elif not self.use_file_storage:
            self.storage.clean()

        # Recreate everything.
        self.setup()

    def optimize(self):
        """Trigger Whoosh's index optimization."""
        if not self.setup_complete:
            self.setup()

        self.index = self.index.refresh()
        self.index.optimize()

    def calculate_page(self, start_offset=0, end_offset=None):
        """Convert an offset slice into Whoosh's 1-based (page, length)."""
        # Prevent against Whoosh throwing an error. Requires an end_offset
        # greater than 0.
        if end_offset is not None and end_offset <= 0:
            end_offset = 1

        # Determine the page.
        page_num = 0

        if end_offset is None:
            end_offset = 1000000

        if start_offset is None:
            start_offset = 0

        page_length = end_offset - start_offset

        if page_length and page_length > 0:
            page_num = int(start_offset / page_length)

        # Increment because Whoosh uses 1-based page numbers.
        page_num += 1

        return page_num, page_length

    @log_query
    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None,
               query_facets=None, narrow_queries=None, spelling_query=None,
               within=None, dwithin=None, distance_point=None, models=None,
               limit_to_registered_models=None, result_class=None, **kwargs):
        """Run ``query_string`` against the index and return a result dict
        with ``results``, ``hits`` and (sometimes) ``spelling_suggestion``.
        """
        if not self.setup_complete:
            self.setup()

        # A zero length query should return no results.
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        query_string = force_text(query_string)

        # A one-character query (non-wildcard) gets nabbed by a stopwords
        # filter and should yield zero results.
        if len(query_string) <= 1 and query_string != u'*':
            return {
                'results': [],
                'hits': 0,
            }

        reverse = False

        if sort_by is not None:
            # Determine if we need to reverse the results and if Whoosh can
            # handle what it's being asked to sort by. Reversing is an
            # all-or-nothing action, unfortunately.
            sort_by_list = []
            reverse_counter = 0

            for order_by in sort_by:
                if order_by.startswith('-'):
                    reverse_counter += 1

            if reverse_counter and reverse_counter != len(sort_by):
                raise SearchBackendError("Whoosh requires all order_by fields"
                                         " to use the same sort direction")

            for order_by in sort_by:
                if order_by.startswith('-'):
                    sort_by_list.append(order_by[1:])

                    if len(sort_by_list) == 1:
                        reverse = True
                else:
                    sort_by_list.append(order_by)

                    if len(sort_by_list) == 1:
                        reverse = False

            sort_by = sort_by_list

        if facets is not None:
            warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

        if date_facets is not None:
            warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

        if query_facets is not None:
            warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_text(nq)), limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if self.index.doc_count():
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query_string)

            # In the event of an invalid/stopworded query, recover gracefully.
            if parsed_query is None:
                return {
                    'results': [],
                    'hits': 0,
                }

            page_num, page_length = self.calculate_page(start_offset, end_offset)

            search_kwargs = {
                'pagelen': page_length,
                'sortedby': sort_by,
                'reverse': reverse,
            }

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None:
                search_kwargs['filter'] = narrowed_results

            try:
                raw_page = searcher.search_page(parsed_query, page_num, **search_kwargs)
            except ValueError:
                if not self.silently_fail:
                    raise

                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            # Because as of Whoosh 2.5.1, it will return the wrong page of
            # results if you request something too high. :(
            if raw_page.pagenum < page_num:
                return {
                    'results': [],
                    'hits': 0,
                    'spelling_suggestion': None,
                }

            results = self._process_results(raw_page, highlight=highlight,
                                            query_string=query_string,
                                            spelling_query=spelling_query,
                                            result_class=result_class)
            searcher.close()

            if hasattr(narrow_searcher, 'close'):
                narrow_searcher.close()

            return results
        else:
            if self.include_spelling:
                if spelling_query:
                    spelling_suggestion = self.create_spelling_suggestion(spelling_query)
                else:
                    spelling_suggestion = self.create_spelling_suggestion(query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        """Find documents similar to ``model_instance``'s content field."""
        if not self.setup_complete:
            self.setup()

        field_name = self.content_field_name
        narrow_queries = set()
        narrowed_results = None
        self.index = self.index.refresh()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add(' OR '.join(
                ['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

        if additional_query_string and additional_query_string != '*':
            narrow_queries.add(additional_query_string)

        narrow_searcher = None

        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(
                    self.parser.parse(force_text(nq)), limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        self.index = self.index.refresh()
        raw_results = EmptyResults()

        searcher = None
        if self.index.doc_count():
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                raw_results = results[0].more_like_this(field_name, top=end_offset)

        # Handle the case where the results have been narrowed.
        if narrowed_results is not None and hasattr(raw_results, 'filter'):
            raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        results = self._process_results(raw_page, result_class=result_class)

        if searcher:
            searcher.close()

        if hasattr(narrow_searcher, 'close'):
            narrow_searcher.close()

        return results

    def _process_results(self, raw_page, highlight=False, query_string='',
                         spelling_query=None, result_class=None):
        """Convert a Whoosh results page into haystack's result dict."""
        from haystack import connections
        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                for key, value in raw_result.items():
                    # Renamed from ``index`` to avoid shadowing the
                    # module-level ``whoosh.index`` import.
                    model_index = unified_index.get_index(model)
                    string_key = str(key)

                    if string_key in model_index.fields and hasattr(
                            model_index.fields[string_key], 'convert'):
                        # Special-cased due to the nature of KEYWORD fields.
                        if model_index.fields[string_key].is_multivalued:
                            if value is None or len(value) == 0:
                                additional_fields[string_key] = []
                            else:
                                additional_fields[string_key] = value.split(',')
                        else:
                            additional_fields[string_key] = model_index.fields[
                                string_key].convert(value)
                    else:
                        additional_fields[string_key] = self._to_python(value)

                del (additional_fields[DJANGO_CT])
                del (additional_fields[DJANGO_ID])

                if highlight:
                    sa = StemmingAnalyzer()
                    formatter = WhooshHtmlFormatter('em')
                    terms = [token.text for token in sa(query_string)]

                    whoosh_result = whoosh_highlight(
                        additional_fields.get(self.content_field_name),
                        terms, sa, ContextFragmenter(), formatter)
                    additional_fields['highlighted'] = {
                        self.content_field_name: [whoosh_result],
                    }

                result = result_class(app_label, model_name, raw_result[DJANGO_ID],
                                      score, **additional_fields)
                results.append(result)
            else:
                hits -= 1

        if self.include_spelling:
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def create_spelling_suggestion(self, query_string):
        """Suggest a corrected spelling for ``query_string``, or ``None``."""
        spelling_suggestion = None
        reader = self.index.reader()
        corrector = reader.corrector(self.content_field_name)
        cleaned_query = force_text(query_string)

        if not query_string:
            return spelling_suggestion

        # Clean the string.
        for rev_word in self.RESERVED_WORDS:
            cleaned_query = cleaned_query.replace(rev_word, '')

        for rev_char in self.RESERVED_CHARACTERS:
            cleaned_query = cleaned_query.replace(rev_char, '')

        # Break it down.
        query_words = cleaned_query.split()
        suggested_words = []

        for word in query_words:
            suggestions = corrector.suggest(word, limit=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])

        spelling_suggestion = ' '.join(suggested_words)
        return spelling_suggestion

    def _from_python(self, value):
        """
        Converts Python values to a string for Whoosh.

        Code courtesy of pysolr.
        """
        if hasattr(value, 'strftime'):
            if not hasattr(value, 'hour'):
                # A date (not datetime): promote to midnight.
                value = datetime(value.year, value.month, value.day, 0, 0, 0)
        elif isinstance(value, bool):
            if value:
                value = 'true'
            else:
                value = 'false'
        elif isinstance(value, (list, tuple)):
            value = u','.join([force_text(v) for v in value])
        elif isinstance(value, (six.integer_types, float)):
            # Leave it alone.
            pass
        else:
            value = force_text(value)
        return value

    def _to_python(self, value):
        """
        Converts values from Whoosh to native Python values.

        A port of the same method in pysolr, as they deal with data
        the same way.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

        if value and isinstance(value, six.string_types):
            possible_datetime = DATETIME_REGEX.search(value)

            if possible_datetime:
                date_values = possible_datetime.groupdict()

                for dk, dv in date_values.items():
                    date_values[dk] = int(dv)

                return datetime(date_values['year'], date_values['month'],
                                date_values['day'], date_values['hour'],
                                date_values['minute'], date_values['second'])

        try:
            # Attempt to use json to load the values.
            converted_value = json.loads(value)

            # Try to handle most built-in types.
            if isinstance(converted_value,
                          (list, tuple, set, dict, six.integer_types, float, complex)):
                return converted_value
        except:
            # If it fails (SyntaxError or its ilk) or we don't trust it,
            # continue on.
            pass

        return value
analyzer=analyzer) if not os.path.exists("tmp"): os.mkdir("tmp") ix = create_in("tmp", schema) writer = ix.writer() # add different docs writer.add_document(title="document1", path="/a", content="This is the first document we've added!") writer.add_document(title="document2", path="/b", content="The second one 用来测试中文吧 is even more interesting!") writer.commit() searcher = ix.searcher() parser = QueryParser("content", schema=ix.schema) for keywords in ("你", "first", "中文"): print(keywords + "results are as following: ") q = parser.parse(keywords) results = searcher.search(q) for hit in results: print(hit.highlights("content")) print("\n------------cut line-------------\n") for t in analyzer(""): print(t.text)
class WhooshSearchBackendTestCase(WhooshTestCase):
    """Exercises the Whoosh backend: CRUD, search, sorting, schema building."""

    fixtures = ['bulk_data.json']

    def setUp(self):
        super(WhooshSearchBackendTestCase, self).setUp()

        self.old_ui = connections['whoosh'].get_unified_index()
        self.ui = UnifiedIndex()
        self.wmmi = WhooshMockSearchIndex()
        self.wmmidni = WhooshMockSearchIndexWithSkipDocument()
        self.wmtmmi = WhooshMaintainTypeMockSearchIndex()
        self.ui.build(indexes=[self.wmmi])
        self.sb = connections['whoosh'].get_backend()
        connections['whoosh']._index = self.ui

        self.sb.setup()
        self.raw_whoosh = self.sb.index
        self.parser = QueryParser(self.sb.content_field_name, schema=self.sb.schema)
        self.sb.delete_index()

        self.sample_objs = MockModel.objects.all()

    def tearDown(self):
        connections['whoosh']._index = self.old_ui
        super(WhooshSearchBackendTestCase, self).tearDown()

    def whoosh_search(self, query):
        # Query the raw index directly, bypassing the backend's search().
        self.raw_whoosh = self.raw_whoosh.refresh()
        searcher = self.raw_whoosh.searcher()

        return searcher.search(self.parser.parse(query), limit=1000)

    def test_non_silent(self):
        bad_sb = connections['whoosh'].backend('bad', PATH='/tmp/bad_whoosh',
                                               SILENTLY_FAIL=False)
        bad_sb.use_file_storage = False
        bad_sb.storage = 'omg.wtf.bbq'

        # FIX: the original wrapped each call in
        # ``try: ...; self.fail() except: pass`` -- the bare except also
        # swallowed the AssertionError raised by self.fail(), so the test
        # could never fail. assertRaises expresses the intent correctly.
        with self.assertRaises(Exception):
            bad_sb.update(self.wmmi, self.sample_objs)

        with self.assertRaises(Exception):
            bad_sb.remove('core.mockmodel.1')

        with self.assertRaises(Exception):
            bad_sb.clear()

        with self.assertRaises(Exception):
            bad_sb.search('foo')

    def test_update(self):
        self.sb.update(self.wmmi, self.sample_objs)

        # Check what Whoosh thinks is there.
        self.assertEqual(len(self.whoosh_search(u'*')), 23)
        self.assertEqual(
            [doc.fields()['id'] for doc in self.whoosh_search(u'*')],
            [u'core.mockmodel.%s' % i for i in range(1, 24)])

    def test_update_with_SkipDocument_raised(self):
        self.sb.update(self.wmmidni, self.sample_objs)

        # Check what Whoosh thinks is there.
        res = self.whoosh_search(u'*')
        self.assertEqual(len(res), 14)
        ids = [1, 2, 5, 6, 7, 8, 9, 11, 12, 14, 15, 18, 20, 21]
        self.assertListEqual([doc.fields()['id'] for doc in res],
                             [u'core.mockmodel.%s' % i for i in ids])

    def test_remove(self):
        self.sb.update(self.wmmi, self.sample_objs)
        self.assertEqual(self.sb.index.doc_count(), 23)

        self.sb.remove(self.sample_objs[0])
        self.assertEqual(self.sb.index.doc_count(), 22)

    def test_clear(self):
        self.sb.update(self.wmmi, self.sample_objs)
        self.assertEqual(self.sb.index.doc_count(), 23)

        self.sb.clear()
        self.assertEqual(self.sb.index.doc_count(), 0)

        self.sb.update(self.wmmi, self.sample_objs)
        self.assertEqual(self.sb.index.doc_count(), 23)

        # Clearing an unrelated model leaves everything in place.
        self.sb.clear([AnotherMockModel])
        self.assertEqual(self.sb.index.doc_count(), 23)

        self.sb.clear([MockModel])
        self.assertEqual(self.sb.index.doc_count(), 0)

        self.sb.index.refresh()
        self.sb.update(self.wmmi, self.sample_objs)
        self.assertEqual(self.sb.index.doc_count(), 23)

        self.sb.clear([AnotherMockModel, MockModel])
        self.assertEqual(self.raw_whoosh.doc_count(), 0)

    def test_search(self):
        self.sb.update(self.wmmi, self.sample_objs)
        self.assertEqual(len(self.whoosh_search(u'*')), 23)

        # No query string should always yield zero results.
        self.assertEqual(self.sb.search(u''), {'hits': 0, 'results': []})

        # A one letter query string gets nabbed by a stopwords filter. Should
        # always yield zero results.
        self.assertEqual(self.sb.search(u'a'), {'hits': 0, 'results': []})

        # Possible AttributeError?
        # self.assertEqual(self.sb.search(u'a b'), {'hits': 0, 'results': [], 'spelling_suggestion': '', 'facets': {}})

        self.assertEqual(self.sb.search(u'*')['hits'], 23)
        self.assertEqual(
            [result.pk for result in self.sb.search(u'*')['results']],
            [u'%s' % i for i in range(1, 24)])

        self.assertEqual(self.sb.search(u'Indexe')['hits'], 23)
        self.assertEqual(
            self.sb.search(u'Indexe')['spelling_suggestion'], u'indexed')

        # Whoosh faceting is unsupported: facets should come back empty.
        self.assertEqual(self.sb.search(u'', facets=['name']), {
            'hits': 0,
            'results': []
        })
        results = self.sb.search(u'Index*', facets=['name'])
        results = self.sb.search(u'index*', facets=['name'])
        self.assertEqual(results['hits'], 23)
        self.assertEqual(results['facets'], {})

        self.assertEqual(
            self.sb.search(u'', date_facets={
                'pub_date': {
                    'start_date': date(2008, 2, 26),
                    'end_date': date(2008, 2, 26),
                    'gap': '/MONTH'
                }
            }), {
                'hits': 0,
                'results': []
            })
        results = self.sb.search(u'Index*', date_facets={
            'pub_date': {
                'start_date': date(2008, 2, 26),
                'end_date': date(2008, 2, 26),
                'gap': '/MONTH'
            }
        })
        results = self.sb.search(u'index*', date_facets={
            'pub_date': {
                'start_date': date(2008, 2, 26),
                'end_date': date(2008, 2, 26),
                'gap': '/MONTH'
            }
        })
        self.assertEqual(results['hits'], 23)
        self.assertEqual(results['facets'], {})

        self.assertEqual(
            self.sb.search(u'', query_facets={'name': '[* TO e]'}), {
                'hits': 0,
                'results': []
            })
        results = self.sb.search(u'Index*', query_facets={'name': '[* TO e]'})
        results = self.sb.search(u'index*', query_facets={'name': '[* TO e]'})
        self.assertEqual(results['hits'], 23)
        self.assertEqual(results['facets'], {})

        # self.assertEqual(self.sb.search('', narrow_queries=set(['name:daniel1'])), {'hits': 0, 'results': []})
        # results = self.sb.search('Index*', narrow_queries=set(['name:daniel1']))
        # self.assertEqual(results['hits'], 1)

        # Ensure that swapping the ``result_class`` works.
        self.assertTrue(
            isinstance(
                self.sb.search(u'Index*',
                               result_class=MockSearchResult)['results'][0],
                MockSearchResult))

        # Check the use of ``limit_to_registered_models``.
        self.assertEqual(self.sb.search(u'', limit_to_registered_models=False), {
            'hits': 0,
            'results': []
        })
        self.assertEqual(
            self.sb.search(u'*', limit_to_registered_models=False)['hits'], 23)
        self.assertEqual([
            result.pk for result in self.sb.search(
                u'*', limit_to_registered_models=False)['results']
        ], [u'%s' % i for i in range(1, 24)])

        # Stow.
        old_limit_to_registered_models = getattr(
            settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)
        settings.HAYSTACK_LIMIT_TO_REGISTERED_MODELS = False

        self.assertEqual(self.sb.search(u''), {'hits': 0, 'results': []})
        self.assertEqual(self.sb.search(u'*')['hits'], 23)
        self.assertEqual(
            [result.pk for result in self.sb.search(u'*')['results']],
            [u'%s' % i for i in range(1, 24)])

        # Restore.
        settings.HAYSTACK_LIMIT_TO_REGISTERED_MODELS = old_limit_to_registered_models

    def test_highlight(self):
        self.sb.update(self.wmmi, self.sample_objs)
        self.assertEqual(len(self.whoosh_search(u'*')), 23)

        self.assertEqual(self.sb.search(u'', highlight=True), {
            'hits': 0,
            'results': []
        })
        self.assertEqual(self.sb.search(u'index*', highlight=True)['hits'], 23)

        query = self.sb.search('Index*', highlight=True)['results']
        result = [result.highlighted['text'][0] for result in query]

        self.assertEqual(result,
                         ['<em>Indexed</em>!\n%d' % i for i in range(1, 24)])

    def test_search_all_models(self):
        wamsi = WhooshAnotherMockSearchIndex()
        self.ui.build(indexes=[self.wmmi, wamsi])

        self.sb.update(self.wmmi, self.sample_objs)
        self.sb.update(wamsi, AnotherMockModel.objects.all())

        self.assertEqual(len(self.whoosh_search(u'*')), 25)

        self.ui.build(indexes=[self.wmmi])

    def test_more_like_this(self):
        self.sb.update(self.wmmi, self.sample_objs)
        self.assertEqual(len(self.whoosh_search(u'*')), 23)

        # Now supported by Whoosh (as of 1.8.4). See the ``LiveWhooshMoreLikeThisTestCase``.
        self.assertEqual(
            self.sb.more_like_this(self.sample_objs[0])['hits'], 22)

        # Make sure that swapping the ``result_class`` doesn't blow up.
        try:
            self.sb.more_like_this(self.sample_objs[0],
                                   result_class=MockSearchResult)
        except:
            self.fail()

    def test_delete_index(self):
        self.sb.update(self.wmmi, self.sample_objs)
        self.assertTrue(self.sb.index.doc_count() > 0)

        self.sb.delete_index()
        self.assertEqual(self.sb.index.doc_count(), 0)

    def test_order_by(self):
        self.sb.update(self.wmmi, self.sample_objs)

        results = self.sb.search(u'*', sort_by=['pub_date'])
        self.assertEqual([result.pk for result in results['results']], [
            u'1', u'3', u'2', u'4', u'5', u'6', u'7', u'8', u'9', u'10', u'11',
            u'12', u'13', u'14', u'15', u'16', u'17', u'18', u'19', u'20',
            u'21', u'22', u'23'
        ])

        results = self.sb.search(u'*', sort_by=['-pub_date'])
        self.assertEqual([result.pk for result in results['results']], [
            u'23', u'22', u'21', u'20', u'19', u'18', u'17', u'16', u'15',
            u'14', u'13', u'12', u'11', u'10', u'9', u'8', u'7', u'6', u'5',
            u'4', u'2', u'3', u'1'
        ])

        results = self.sb.search(u'*', sort_by=['id'])
        self.assertEqual([result.pk for result in results['results']], [
            u'1', u'10', u'11', u'12', u'13', u'14', u'15', u'16', u'17',
            u'18', u'19', u'2', u'20', u'21', u'22', u'23', u'3', u'4', u'5',
            u'6', u'7', u'8', u'9'
        ])

        results = self.sb.search(u'*', sort_by=['-id'])
        self.assertEqual([result.pk for result in results['results']], [
            u'9', u'8', u'7', u'6', u'5', u'4', u'3', u'23', u'22', u'21',
            u'20', u'2', u'19', u'18', u'17', u'16', u'15', u'14', u'13',
            u'12', u'11', u'10', u'1'
        ])

        results = self.sb.search(u'*', sort_by=['-pub_date', '-id'])
        self.assertEqual([result.pk for result in results['results']], [
            u'23', u'22', u'21', u'20', u'19', u'18', u'17', u'16', u'15',
            u'14', u'13', u'12', u'11', u'10', u'9', u'8', u'7', u'6', u'5',
            u'4', u'2', u'3', u'1'
        ])

        # Mixed sort directions aren't supported by the Whoosh backend.
        self.assertRaises(SearchBackendError,
                          self.sb.search,
                          u'*',
                          sort_by=['-pub_date', 'id'])

    def test__from_python(self):
        self.assertEqual(self.sb._from_python('abc'), u'abc')
        self.assertEqual(self.sb._from_python(1), 1)
        self.assertEqual(self.sb._from_python(2653), 2653)
        self.assertEqual(self.sb._from_python(25.5), 25.5)
        self.assertEqual(self.sb._from_python([1, 2, 3]), u'1,2,3')
        self.assertTrue("a': 1" in self.sb._from_python({
            'a': 1,
            'c': 3,
            'b': 2
        }))
        self.assertEqual(self.sb._from_python(datetime(2009, 5, 9, 16, 14)),
                         datetime(2009, 5, 9, 16, 14))
        self.assertEqual(self.sb._from_python(datetime(2009, 5, 9, 0, 0)),
                         datetime(2009, 5, 9, 0, 0))
        self.assertEqual(self.sb._from_python(datetime(1899, 5, 18, 0, 0)),
                         datetime(1899, 5, 18, 0, 0))
        self.assertEqual(
            self.sb._from_python(datetime(2009, 5, 18, 1, 16, 30, 250)),
            datetime(2009, 5, 18, 1, 16, 30, 250))

    def test__to_python(self):
        self.assertEqual(self.sb._to_python('abc'), 'abc')
        self.assertEqual(self.sb._to_python('1'), 1)
        self.assertEqual(self.sb._to_python('2653'), 2653)
        self.assertEqual(self.sb._to_python('25.5'), 25.5)
        self.assertEqual(self.sb._to_python('[1, 2, 3]'), [1, 2, 3])
        self.assertEqual(self.sb._to_python('{"a": 1, "b": 2, "c": 3}'), {
            'a': 1,
            'c': 3,
            'b': 2
        })
        self.assertEqual(self.sb._to_python('2009-05-09T16:14:00'),
                         datetime(2009, 5, 9, 16, 14))
        self.assertEqual(self.sb._to_python('2009-05-09T00:00:00'),
                         datetime(2009, 5, 9, 0, 0))
        self.assertEqual(self.sb._to_python(None), None)

    def test_range_queries(self):
        self.sb.update(self.wmmi, self.sample_objs)

        self.assertEqual(len(self.whoosh_search(u'[d TO]')), 23)
        self.assertEqual(len(self.whoosh_search(u'name:[d TO]')), 23)
        self.assertEqual(len(self.whoosh_search(u'Ind* AND name:[d to]')), 23)
        self.assertEqual(len(self.whoosh_search(u'Ind* AND name:[to c]')), 0)

    def test_date_queries(self):
        self.sb.update(self.wmmi, self.sample_objs)

        self.assertEqual(len(self.whoosh_search(u"pub_date:20090717003000")),
                         1)
        self.assertEqual(len(self.whoosh_search(u"pub_date:20090717000000")),
                         0)
        self.assertEqual(
            len(self.whoosh_search(u'Ind* AND pub_date:[to 20090717003000]')),
            3)

    def test_escaped_characters_queries(self):
        self.sb.update(self.wmmi, self.sample_objs)

        self.assertEqual(len(self.whoosh_search(u"Indexed\!")), 23)
        self.assertEqual(
            len(self.whoosh_search(u"http\:\/\/www\.example\.com")), 0)

    def test_build_schema(self):
        ui = UnifiedIndex()
        ui.build(indexes=[AllTypesWhooshMockSearchIndex()])

        (content_field_name,
         schema) = self.sb.build_schema(ui.all_searchfields())
        self.assertEqual(content_field_name, 'text')
        self.assertEqual(len(schema.names()), 9)
        self.assertEqual(schema.names(), [
            'django_ct', 'django_id', 'id', 'is_active', 'name', 'pub_date',
            'seen_count', 'sites', 'text'
        ])
        self.assertTrue(isinstance(schema._fields['text'], TEXT))
        self.assertTrue(isinstance(schema._fields['pub_date'], DATETIME))
        self.assertTrue(isinstance(schema._fields['seen_count'], NUMERIC))
        self.assertTrue(isinstance(schema._fields['sites'], KEYWORD))
        self.assertTrue(isinstance(schema._fields['is_active'], BOOLEAN))

    def test_verify_type(self):
        old_ui = connections['whoosh'].get_unified_index()
        ui = UnifiedIndex()
        wmtmmi = WhooshMaintainTypeMockSearchIndex()
        ui.build(indexes=[wmtmmi])
        connections['whoosh']._index = ui
        sb = connections['whoosh'].get_backend()
        sb.setup()
        sb.update(wmtmmi, self.sample_objs)

        self.assertEqual(sb.search(u'*')['hits'], 23)
        self.assertEqual(
            [result.month for result in sb.search(u'*')['results']], [
                u'06', u'07', u'06', u'07', u'07', u'07', u'07', u'07', u'07',
                u'07', u'07', u'07', u'07', u'07', u'07', u'07', u'07', u'07',
                u'07', u'07', u'07', u'07', u'07'
            ])
        connections['whoosh']._index = old_ui

    @unittest.skipIf(
        settings.HAYSTACK_CONNECTIONS['whoosh'].get('STORAGE') != 'file',
        'testing writability requires Whoosh to use STORAGE=file')
    def test_writable(self):
        if not os.path.exists(settings.HAYSTACK_CONNECTIONS['whoosh']['PATH']):
            os.makedirs(settings.HAYSTACK_CONNECTIONS['whoosh']['PATH'])

        os.chmod(settings.HAYSTACK_CONNECTIONS['whoosh']['PATH'], 0o400)

        try:
            self.sb.setup()
            self.fail()
        except IOError:
            # Yay. We failed.
            pass

        os.chmod(settings.HAYSTACK_CONNECTIONS['whoosh']['PATH'], 0o755)

    def test_slicing(self):
        self.sb.update(self.wmmi, self.sample_objs)

        page_1 = self.sb.search(u'*', start_offset=0, end_offset=20)
        page_2 = self.sb.search(u'*', start_offset=20, end_offset=30)
        self.assertEqual(len(page_1['results']), 20)
        self.assertEqual([result.pk for result in page_1['results']],
                         [u'%s' % i for i in range(1, 21)])
        self.assertEqual(len(page_2['results']), 3)
        self.assertEqual([result.pk for result in page_2['results']],
                         [u'21', u'22', u'23'])

        # This used to throw an error.
        page_0 = self.sb.search(u'*', start_offset=0, end_offset=0)
        self.assertEqual(len(page_0['results']), 1)

    @unittest.expectedFailure
    def test_scoring(self):
        self.sb.update(self.wmmi, self.sample_objs)

        page_1 = self.sb.search(u'index', start_offset=0, end_offset=20)
        page_2 = self.sb.search(u'index', start_offset=20, end_offset=30)
        self.assertEqual(len(page_1['results']), 20)
        self.assertEqual(
            ["%0.2f" % result.score for result in page_1['results']], [
                '0.51', '0.51', '0.51', '0.51', '0.51', '0.51', '0.51', '0.51',
                '0.51', '0.40', '0.40', '0.40', '0.40', '0.40', '0.40', '0.40',
                '0.40', '0.40', '0.40', '0.40'
            ])
        self.assertEqual(len(page_2['results']), 3)
        self.assertEqual(
            ["%0.2f" % result.score for result in page_2['results']],
            ['0.40', '0.40', '0.40'])
#!/usr/bin/env python
"""Interactive lookup of indexed news items in a Whoosh index."""
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh.query import *

ix = open_dir("index_dir")

with ix.searcher() as searcher:
    # Change to ``while True`` to read queries interactively from the terminal.
    while False:
        text = input("Dime:")

        if len(text) == 0:
            break

        query = QueryParser("content", ix.schema).parse(text)
        results = searcher.search(query)

        for r in results:
            # Print the global ID and the position of the news item within
            # its source document.
            print("ID global:", r["doc"] + ".", "Número de noticia:", r["num_noticia"])

            # With fewer than 4 results, print the whole news item.
            if len(results) < 4:
                # FIX: the original leaked the file handle by rebinding ``f``
                # to the split contents without ever closing the file.
                with open("./enero/" + r["doc"], mode='r') as doc_file:
                    noticias = doc_file.read().split("<DOC>")
                print(noticias[int(r["num_noticia"])])

# Documents containing the text "valencia".
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("valencia")
    results = searcher.search(query)
loadFile(writer, filename) ############# END for walkFolder walkFolder(writer, "C:\DSA") # Commit changes writer.commit() # save changes # Get input, conver to unicode qstr = input("Input a qeury: ") print("searching for ", qstr) #################################### # Build query parser and parse query #################################### qp = QueryParser("content", schema=ix.schema) q = qp.parse(qstr) print(q) #################################### # Search the content field #################################### with ix.searcher(weighting=scoring.TF_IDF()) as s: results = s.search(q) for hit in results: print("Cell {} of Notebook '{}'".format(hit['cell_no'], hit['filename']))
from django.http import JsonResponse
from documents.models import CodeOKVED, CodeOKS
from documents.models import Documents
from documents.serializers import DocumentsSerializer
from search.models import Search, AutoCompletion, SearchHistory
from search.serializers import SearchOptionsSerializer
from settings.common import SEARCH_ENGINE
from whoosh.fields import *
from whoosh.qparser import QueryParser

LOGGER = logging.getLogger('django')

# When Whoosh is the configured engine, open the index once at import time
# and keep a long-lived searcher plus one QueryParser per searchable field.
if settings.SEARCH_ENGINE.get('name') == 'whoosh':
    ix = index.open_dir(SEARCH_ENGINE.get('indexdir'))
    CLIENT = ix.searcher()
    QUERY_DOC_KIND = QueryParser("doc_kind", ix.schema)
    QUERY_DOC_MARK = QueryParser("doc_mark", ix.schema)
    QUERY_DOC_NAME_RU = QueryParser("doc_name_ru", ix.schema)
    QUERY_DOC_NAME_EN = QueryParser("doc_name_en", ix.schema)
    QUERY_DOC_NAME_ANNOTATION = QueryParser("doc_annotation", ix.schema)
    QUERY_DOC_COMMENT = QueryParser("doc_comment", ix.schema)
    QUERY_DOC_FULL_MARK = QueryParser("doc_full_mark", ix.schema)
    QUERY_DOC_STATUS = QueryParser("doc_status", ix.schema)
    QUERY_TK_RUS = QueryParser("tk_rus", ix.schema)
    QUERY_MTK_DEV = QueryParser("mtk_dev", ix.schema)
    QUERY_KEYWORDS = QueryParser("keywords", ix.schema)

    # BUG FIX: `parse()` returns a Query object and does not mutate the
    # parser; the original discarded that object and passed the parser
    # itself to `search()`. Capture the parsed query and search with it.
    test_query = QUERY_DOC_NAME_RU.parse("акустика")
    res = CLIENT.search(test_query)
    # Guard the smoke-test print: `res[0]` raised IndexError on an empty index.
    if res:
        print(f'Test request results={res[0].get("doc_id")}')
def search_database(keyword):
    """Full-text search of the 'name' field for *keyword*.

    Returns a list of hit dicts (the stored fields of each matching
    document).

    BUG FIX: the original returned the live Whoosh ``Results`` object out
    of the ``with index.searcher()`` block; once the searcher closes, the
    results are backed by a closed reader and accessing them fails.
    Materializing the stored fields before the context exits makes the
    return value safe to use by any caller.
    """
    with index.searcher() as searcher:
        query = QueryParser('name', index.schema).parse(keyword)
        return [dict(hit) for hit in searcher.search(query)]
def search():
    """Handle a paste-search request: first match paste names via redis,
    then full-text search the selected Whoosh index, and render both
    result sets together."""
    query = request.form['query']
    q = []
    q.append(query)
    r = []  # complete path
    c = []  # preview of the paste content
    paste_date = []
    paste_size = []
    index_name = request.form['index_name']
    num_elem_to_get = 50
    # select correct index ("0"/missing means the currently active one)
    if index_name is None or index_name == "0":
        selected_index = get_current_index()
    else:
        selected_index = os.path.join(baseindexpath, index_name)
    # Search filename
    for path in r_serv_pasteName.smembers(q[0]):
        r.append(path)
        paste = Paste.Paste(path)
        content = paste.get_p_content().decode('utf8', 'ignore')
        content_range = max_preview_char if len(content) > max_preview_char else len(content) - 1
        c.append(content[0:content_range])
        # _get_p_date appears to yield YYYYMMDD; reformat as YYYY/MM/DD.
        curr_date = str(paste._get_p_date())
        curr_date = curr_date[0:4] + '/' + curr_date[4:6] + '/' + curr_date[6:]
        paste_date.append(curr_date)
        paste_size.append(paste._get_p_size())
    # Search full line
    # NOTE(review): this local schema is never used — the index supplies
    # its own schema below; confirm before removing.
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
    ix = index.open_dir(selected_index)
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(" ".join(q))
        results = searcher.search_page(query, 1, pagelen=num_elem_to_get)
        for x in results:
            r.append(x.items()[0][1])
            paste = Paste.Paste(x.items()[0][1])
            content = paste.get_p_content().decode('utf8', 'ignore')
            content_range = max_preview_char if len(content) > max_preview_char else len(content) - 1
            c.append(content[0:content_range])
            curr_date = str(paste._get_p_date())
            curr_date = curr_date[0:4] + '/' + curr_date[4:6] + '/' + curr_date[6:]
            paste_date.append(curr_date)
            paste_size.append(paste._get_p_size())
        # Second, un-paged search used only to obtain the total hit count.
        results = searcher.search(query)
        num_res = len(results)
    index_min = 1
    index_max = len(get_index_list())
    return render_template("search.html", r=r, c=c, query=request.form['query'], paste_date=paste_date, paste_size=paste_size, char_to_display=max_preview_modal, num_res=num_res, index_min=index_min, index_max=index_max, index_list=get_index_list(selected_index))
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 4 20:01:53 2019

@author: Fachry Firdaus
"""
from whoosh.qparser import QueryParser
from whoosh import scoring
from whoosh.index import open_dir
import sys

ix = open_dir("indexdir")

# Interactive loop: keep asking for query strings until the user answers
# anything other than "Y" to the continue prompt.
lanjut = "Y"
while (lanjut == "Y"):
    # query_str is query string
    query_str = input('Apa yang ingin anda telusuri? \n')
    # Top 'n' documents as result
    topN = 50
    with ix.searcher(weighting=scoring.Frequency) as searcher:
        query = QueryParser("content", ix.schema).parse(query_str)
        results = searcher.search(query, limit=topN)
        # BUG FIX: iterate the hits actually returned. Indexing
        # results[0..topN-1] raised IndexError whenever fewer than topN
        # documents matched the query.
        for hit in results:
            print("Nama dokumen : ", hit['title'], "\tscore : ",
                  str(hit.score), "\n", hit['textdata'], "\n")
    lanjut = input("Ingin menelusuri lagi? Y/N \n")
class MySchema(SchemaClass):
    # stored=True means the field's value is returned with search results.
    path = ID(stored=True)
    title = TEXT(stored=True)
    content = TEXT
    tags = KEYWORD
    icon = TEXT


if not os.path.exists('index'):
    os.mkdir('index')

# create_in builds an index storage directory object for the schema above;
# all index data is kept inside the 'index' directory.
indeeex = create_in('index', MySchema)
writer = indeeex.writer()
writer.add_document(title=u"my document", content=u"this is my document", path=u"/a", tags=u"firlst short", icon=u"/icons/star.png")
writer.add_document(title=u"my second document", content=u"this is my second document", path=u"/b", tags=u"second short", icon=u"/icons/sheep.png")
writer.commit()

# searcher = indeeex.searcher()
# Works like open(); the `with` form keeps the searcher's lifetime scoped.
with indeeex.searcher() as searcher:
    # do something
    query = QueryParser("content", indeeex.schema).parse("second")
    result = searcher.search(query)
    print(result)
    print(list(result))
if count != 0: score /= count return score def symmetric_sentence_similarity(sentence1, sentence2): """ compute the symmetric sentence similarity using Wordnet """ return (sentence_similarity(sentence1, sentence2) + sentence_similarity(sentence2, sentence1)) / 2 model = {} max_info = {} ix = open_dir("WikiSplit") # use BM25F to calculate the similarity between title and query searcher = ix.searcher() schema = ix.schema parser = QueryParser("title",schema) # parser.add_plugin(MultifieldPlugin(["title","sent"])) # claimsWithId contains (id, claim) pairs claimTogether,claimsWithId = readClaims("train.json") # Get the entities from all claims(for time saving) st = StanfordNERTagger('english.conll.4class.distsim.crf.ser.gz', 'stanford-ner.jar') NerSen = st.tag(nltk.word_tokenize(claimTogether)) # NumOfClaim record the number of claims that have been processed NumOfClaim = 0 TaggedClaim = [] claims = {} start = time.time()
def searchPapers_whoosh(year=None, author=None, topic=None, userQuery=None):
    """Search the paper index, optionally filtered by year/author/topic.

    userQuery is lemmatized (nouns then verbs) before parsing; when it is
    empty or None a match-all query is used. Returns a list of matching
    paper ids (ints), at most 50.
    """
    # Open the existing index
    import whoosh.index as index
    import nltk
    nltk.download('wordnet')
    from nltk.stem.wordnet import WordNetLemmatizer
    lemma = WordNetLemmatizer()

    # BUG FIX: the original lemmatized unconditionally, so the default
    # userQuery=None crashed on .split() even though the empty-query case
    # is explicitly handled further down.
    if userQuery:
        userQuery = " ".join(
            lemma.lemmatize(word, 'n') for word in userQuery.split())
        userQuery = " ".join(
            lemma.lemmatize(word, 'v') for word in userQuery.split())

    index_dir = "../index"
    ix = index.open_dir(index_dir)

    # UI sentinel values mean "no filter".
    if topic == 'All the topics':
        topic = None
    if year == 'All the years':
        year = None

    # Parse with filter on fields
    from whoosh import query
    from whoosh import qparser
    from whoosh.qparser import QueryParser
    from whoosh.qparser import MultifieldParser

    with ix.searcher() as s:
        if (not userQuery):
            # No free-text query: match every document by id.
            qp = QueryParser("id", schema=ix.schema)
            user_q = qp.parse("*")
        else:
            # OrGroup.factory(0.8):
            # 0 = importance to documents with one of the terms
            # 1 = importance to documents with all of the terms
            og = qparser.OrGroup.factory(0.8)
            # search both in title and text
            mparser = MultifieldParser(["title", "paper_text"],
                                       schema=ix.schema,
                                       group=og)
            user_q = mparser.parse(userQuery)

        # Filter results for fields
        allow_q = query.NullQuery
        if (year):
            allow_q = allow_q & query.Term("year", year)
        if (author):
            # Wildcard match each author token independently.
            formattedAuthors = author.lower().split()
            for fa in formattedAuthors:
                fa = "*" + fa + "*"
                allow_q = allow_q & query.Wildcard("authors", fa)
        if (topic):
            topicParser = qparser.QueryParser("topic", ix.schema)
            allow_q = allow_q & topicParser.parse('"' + topic + '"')

        if (not year and not author and not topic):
            results = s.search(user_q, limit=50)
        else:
            results = s.search(user_q, filter=allow_q, limit=50)

        papers = []
        for result in results:
            papers.extend([int(result['id'])])
        return papers
def filter_queryset(self, request, queryset, view):
    """Filter *queryset* by the 'q' query parameter, using ORM shortcuts
    for common queries and falling back to the Whoosh full-text index."""
    if ('parent' in request.query_params and
            request.query_params['parent'] == ''):
        # Empty string means query for null parent
        queryset = queryset.filter(parent=None)
    try:
        q = request.query_params['q']
    except KeyError:
        # No search requested; return the queryset unchanged.
        return queryset

    # Short-circuit some commonly used queries
    COMMON_QUERY_TO_ORM_FILTER = {
        'asset_type:block': {
            'asset_type': 'block'
        },
        'asset_type:question': {
            'asset_type': 'question'
        },
        'asset_type:survey': {
            'asset_type': 'survey'
        },
        'asset_type:question OR asset_type:block': {
            'asset_type__in': ('question', 'block')
        }
    }
    try:
        return queryset.filter(**COMMON_QUERY_TO_ORM_FILTER[q])
    except KeyError:
        # We don't know how to short-circuit this query; pass it along to
        # the search engine
        pass
    except FieldError:
        # The user passed a query we recognized as commonly-used, but the
        # field was invalid for the requested model
        return queryset.none()

    queryset_pks = list(queryset.values_list('pk', flat=True))
    if not len(queryset_pks):
        # Nothing to search within.
        return queryset

    # 'q' means do a full-text search of the document fields, where the
    # critera are given in the Whoosh query language:
    # https://pythonhosted.org/Whoosh/querylang.html
    search_queryset = SearchQuerySet().models(queryset.model)
    search_backend = search_queryset.query.backend
    if not isinstance(search_backend, WhooshSearchBackend):
        raise NotImplementedError(
            'Only the Whoosh search engine is supported at this time')
    if not search_backend.setup_complete:
        search_backend.setup()

    # Parse the user's query
    user_query = QueryParser('text', search_backend.index.schema).parse(q)
    # Construct a query to restrict the search to the appropriate model
    filter_query = Term(DJANGO_CT, get_model_ct(queryset.model))

    # Does the search index for this model have a field that allows
    # filtering by permissions?
    haystack_index = haystack.connections['default'].get_unified_index(
    ).get_index(queryset.model)
    if hasattr(haystack_index, 'users_granted_permission'):
        # Also restrict the search to records that the user can access
        filter_query &= Term('users_granted_permission',
                             request.user.username)

    with search_backend.index.searcher() as searcher:
        results = searcher.search(user_query,
                                  filter=filter_query,
                                  scored=False,
                                  sortedby=None,
                                  limit=None)
        if not results:
            # We got nothing; is the search index even valid?
            if not searcher.search(filter_query, limit=1):
                # There's not a single entry in the search index for this
                # model; assume the index is invalid and return the
                # queryset untouched
                return queryset
        pk_type = type(queryset_pks[0])
        results_pks = {
            # Coerce each `django_id` from unicode to the appropriate type,
            # usually `int`
            pk_type((x['django_id'])) for x in results
        }
    filter_pks = results_pks.intersection(queryset_pks)
    return queryset.filter(pk__in=filter_pks)
# #索引构建,基于路径的基本索引 # writer = ix.writer() # count = 0 # # 遍历根目录对索引文本内容构建索引 # for root, dirs, files in os.walk(file_path, topdown=True): # for file in files: # path_t = os.path.join(root, file) # if path_t.split('.')[-1] != 'txt' or path_t.split('\\')[-1] =='index.txt': # continue # print("=======>"+path_t,file) # f = open( path_t, 'r', encoding='UTF-8') # content = '' # for line in f: # content = content + line # writer.add_document(title=file, content=content, path= path_t) # count =count+1 # writer.commit() # print("==========>共索引文件%d个"%count) #查询构建,测试索引构建的效果 from whoosh.qparser import QueryParser with ix.searcher() as searcher: query = QueryParser("content",ix.schema).parse("信息检索") result = searcher.search(query) for item in result: print(item) print(len(result))
def _query_projects(self, searcher, querystring, page=1):
    """Parse *querystring* against the project index schema and delegate
    the actual search to ``_search_projects``."""
    # Build the parser fresh each call so plugin configuration stays current.
    parsed = QueryParser(
        "text",
        self.project_ix.schema,
        plugins=self._query_parser_plugins(),
    ).parse(querystring)
    return self._search_projects(searcher, parsed, page=page)
# Read the franchise list, one name per line.
path = "../data/FranchiseList.txt"
file = open(path, "r")
fileContent = str(file.read())
file.close()
lines = fileContent.split("\n")

# Index the documents: one document per franchise name.
writer = ix.writer()
for l in lines:
    writer.add_document(Name=l)
    print(l)
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("Name", ix.schema).parse(u'Zero')
    results = searcher.search(query)
    # Results: report exact or estimated hit counts, then each hit.
    found = results.scored_length()
    if results.has_exact_length():
        print("Scored", found, "of exactly", len(results), "documents")
    else:
        low = results.estimated_min_length()
        high = results.estimated_length()
        print("Scored", found, "of between", low, "and", high, "documents")
    for r in results:
        print(r)
from functools import reduce
import whoosh.index as index
import json
import helpers
from index_posts import schema
from whoosh.qparser import QueryParser

ix = index.open_dir("indexdir")
searcher = ix.searcher()
qp = QueryParser('tokens', schema=schema)

if __name__ == "__main__":
    # successes[k] counts queries whose matching post appeared at rank k.
    successes = dict(zip(range(0, 20), [0] * 20))
    with open('../dataset.json') as dataset_file:
        for (post_id, post) in json.load(dataset_file).items():
            # Query with the preprocessed title of the first child post.
            q = qp.parse(' '.join(
                helpers.preprocess_text(post['Children'][0]['Title'])))
            print(q)
            print(post['Children'][0]['Title'])
            print(post['Title'])
            results = searcher.search(q, limit=20)
            if len(results) > 0:
                # BUG FIX: the loop variable was named `index`, shadowing
                # the `whoosh.index` module imported above; renamed to
                # `rank` (which is also what it semantically is).
                for rank, result in enumerate(results):
                    if result['post_id'] == post_id:
                        successes[rank] += 1
                        print(result['title'])
                        print('Success: True')
                        break
class WhooshSearchBackend(BaseSearchBackend): # Word reserved by Whoosh for special use. RESERVED_WORDS = ("AND", "NOT", "OR", "TO") # Characters reserved by Whoosh for special use. # The '\\' must come first, so as not to overwrite the other slash replacements. RESERVED_CHARACTERS = ( "\\", "+", "-", "&&", "||", "!", "(", ")", "{", "}", "[", "]", "^", '"', "~", "*", "?", ":", ".", ) def __init__(self, connection_alias, **connection_options): super().__init__(connection_alias, **connection_options) self.setup_complete = False self.use_file_storage = True self.post_limit = getattr(connection_options, "POST_LIMIT", 128 * 1024 * 1024) self.path = connection_options.get("PATH") if connection_options.get("STORAGE", "file") != "file": self.use_file_storage = False if self.use_file_storage and not self.path: raise ImproperlyConfigured( "You must specify a 'PATH' in your settings for connection '%s'." % connection_alias ) self.log = logging.getLogger("haystack") def setup(self): """ Defers loading until needed. """ from haystack import connections new_index = False # Make sure the index is there. if self.use_file_storage and not os.path.exists(self.path): os.makedirs(self.path) new_index = True if self.use_file_storage and not os.access(self.path, os.W_OK): raise IOError( "The path to your Whoosh index '%s' is not writable for the current user/group." 
% self.path ) if self.use_file_storage: self.storage = FileStorage(self.path) else: global LOCALS if getattr(LOCALS, "RAM_STORE", None) is None: LOCALS.RAM_STORE = RamStorage() self.storage = LOCALS.RAM_STORE self.content_field_name, self.schema = self.build_schema( connections[self.connection_alias].get_unified_index().all_searchfields() ) self.parser = QueryParser(self.content_field_name, schema=self.schema) self.parser.add_plugins([FuzzyTermPlugin]) if new_index is True: self.index = self.storage.create_index(self.schema) else: try: self.index = self.storage.open_index(schema=self.schema) except index.EmptyIndexError: self.index = self.storage.create_index(self.schema) self.setup_complete = True def build_schema(self, fields): schema_fields = { ID: WHOOSH_ID(stored=True, unique=True), DJANGO_CT: WHOOSH_ID(stored=True), DJANGO_ID: WHOOSH_ID(stored=True), } # Grab the number of keys that are hard-coded into Haystack. # We'll use this to (possibly) fail slightly more gracefully later. initial_key_count = len(schema_fields) content_field_name = "" for _, field_class in fields.items(): if field_class.is_multivalued: if field_class.indexed is False: schema_fields[field_class.index_fieldname] = IDLIST( stored=True, field_boost=field_class.boost ) else: schema_fields[field_class.index_fieldname] = KEYWORD( stored=True, commas=True, scorable=True, field_boost=field_class.boost, ) elif field_class.field_type in ["date", "datetime"]: schema_fields[field_class.index_fieldname] = DATETIME( stored=field_class.stored, sortable=True ) elif field_class.field_type == "integer": schema_fields[field_class.index_fieldname] = NUMERIC( stored=field_class.stored, numtype=int, field_boost=field_class.boost, ) elif field_class.field_type == "float": schema_fields[field_class.index_fieldname] = NUMERIC( stored=field_class.stored, numtype=float, field_boost=field_class.boost, ) elif field_class.field_type == "boolean": # Field boost isn't supported on BOOLEAN as of 1.8.2. 
schema_fields[field_class.index_fieldname] = BOOLEAN( stored=field_class.stored ) elif field_class.field_type == "ngram": schema_fields[field_class.index_fieldname] = NGRAM( minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost, ) elif field_class.field_type == "edge_ngram": schema_fields[field_class.index_fieldname] = NGRAMWORDS( minsize=2, maxsize=15, at="start", stored=field_class.stored, field_boost=field_class.boost, ) else: schema_fields[field_class.index_fieldname] = TEXT( stored=True, analyzer=ChineseAnalyzer(), field_boost=field_class.boost, sortable=True, ) if field_class.document is True: content_field_name = field_class.index_fieldname schema_fields[field_class.index_fieldname].spelling = True # Fail more gracefully than relying on the backend to die if no fields # are found. if len(schema_fields) <= initial_key_count: raise SearchBackendError( "No fields were found in any search_indexes. Please correct this before attempting to search." ) return (content_field_name, Schema(**schema_fields)) def update(self, index, iterable, commit=True): if not self.setup_complete: self.setup() self.index = self.index.refresh() writer = AsyncWriter(self.index) for obj in iterable: try: doc = index.full_prepare(obj) except SkipDocument: self.log.debug("Indexing for object `%s` skipped", obj) else: # Really make sure it's unicode, because Whoosh won't have it any # other way. for key in doc: doc[key] = self._from_python(doc[key]) # Document boosts aren't supported in Whoosh 2.5.0+. 
if "boost" in doc: del doc["boost"] try: writer.update_document(**doc) except Exception as e: if not self.silently_fail: raise # We'll log the object identifier but won't include the actual object # to avoid the possibility of that generating encoding errors while # processing the log message: self.log.error( "%s while preparing object for update" % e.__class__.__name__, exc_info=True, extra={"data": {"index": index, "object": get_identifier(obj)}}, ) if len(iterable) > 0: # For now, commit no matter what, as we run into locking issues otherwise. writer.commit() if writer.ident is not None: writer.join() def remove(self, obj_or_string, commit=True): if not self.setup_complete: self.setup() self.index = self.index.refresh() whoosh_id = get_identifier(obj_or_string) try: self.index.delete_by_query(q=self.parser.parse('%s:"%s"' % (ID, whoosh_id))) except Exception as e: if not self.silently_fail: raise self.log.error( "Failed to remove document '%s' from Whoosh: %s", whoosh_id, e, exc_info=True, ) def clear(self, models=None, commit=True): if not self.setup_complete: self.setup() self.index = self.index.refresh() if models is not None: assert isinstance(models, (list, tuple)) try: if models is None: self.delete_index() else: models_to_delete = [] for model in models: models_to_delete.append("%s:%s" % (DJANGO_CT, get_model_ct(model))) self.index.delete_by_query( q=self.parser.parse(" OR ".join(models_to_delete)) ) except Exception as e: if not self.silently_fail: raise if models is not None: self.log.error( "Failed to clear Whoosh index of models '%s': %s", ",".join(models_to_delete), e, exc_info=True, ) else: self.log.error("Failed to clear Whoosh index: %s", e, exc_info=True) def delete_index(self): # Per the Whoosh mailing list, if wiping out everything from the index, # it's much more efficient to simply delete the index files. 
if self.use_file_storage and os.path.exists(self.path): shutil.rmtree(self.path) elif not self.use_file_storage: self.storage.clean() # Recreate everything. self.setup() def optimize(self): if not self.setup_complete: self.setup() self.index = self.index.refresh() self.index.optimize() def calculate_page(self, start_offset=0, end_offset=None): # Prevent against Whoosh throwing an error. Requires an end_offset # greater than 0. if end_offset is not None and end_offset <= 0: end_offset = 1 # Determine the page. page_num = 0 if end_offset is None: end_offset = 1000000 if start_offset is None: start_offset = 0 page_length = end_offset - start_offset if page_length and page_length > 0: page_num = int(start_offset / page_length) # Increment because Whoosh uses 1-based page numbers. page_num += 1 return page_num, page_length @log_query def search( self, query_string, sort_by=None, start_offset=0, end_offset=None, fields="", highlight=False, facets=None, date_facets=None, query_facets=None, narrow_queries=None, spelling_query=None, within=None, dwithin=None, distance_point=None, models=None, limit_to_registered_models=None, result_class=None, **kwargs ): if not self.setup_complete: self.setup() # A zero length query should return no results. if len(query_string) == 0: return {"results": [], "hits": 0} query_string = force_str(query_string) # A one-character query (non-wildcard) gets nabbed by a stopwords # filter and should yield zero results. if len(query_string) <= 1 and query_string != "*": return {"results": [], "hits": 0} reverse = False if sort_by is not None: # Determine if we need to reverse the results and if Whoosh can # handle what it's being asked to sort by. Reversing is an # all-or-nothing action, unfortunately. 
sort_by_list = [] reverse_counter = 0 for order_by in sort_by: if order_by.startswith("-"): reverse_counter += 1 if reverse_counter and reverse_counter != len(sort_by): raise SearchBackendError( "Whoosh requires all order_by fields" " to use the same sort direction" ) for order_by in sort_by: if order_by.startswith("-"): sort_by_list.append(order_by[1:]) if len(sort_by_list) == 1: reverse = True else: sort_by_list.append(order_by) if len(sort_by_list) == 1: reverse = False sort_by = sort_by_list group_by = [] facet_types = {} if facets is not None: group_by += [ FieldFacet(facet, allow_overlap=True, maptype=Count) for facet in facets ] facet_types.update({facet: "fields" for facet in facets}) if date_facets is not None: def _fixup_datetime(dt): if isinstance(dt, datetime): return dt if isinstance(dt, date): return datetime(dt.year, dt.month, dt.day) raise ValueError for key, value in date_facets.items(): start = _fixup_datetime(value["start_date"]) end = _fixup_datetime(value["end_date"]) gap_by = value["gap_by"] gap_amount = value.get("gap_amount", 1) gap = RelativeDelta(**{"%ss" % gap_by: gap_amount}) group_by.append(DateRangeFacet(key, start, end, gap, maptype=Count)) facet_types[key] = "dates" if query_facets is not None: warnings.warn( "Whoosh does not handle query faceting.", Warning, stacklevel=2 ) narrowed_results = None self.index = self.index.refresh() if limit_to_registered_models is None: limit_to_registered_models = getattr( settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True ) if models and len(models): model_choices = sorted(get_model_ct(model) for model in models) elif limit_to_registered_models: # Using narrow queries, limit the results to only models handled # with the current routers. 
model_choices = self.build_models_list() else: model_choices = [] if len(model_choices) > 0: if narrow_queries is None: narrow_queries = set() narrow_queries.add( " OR ".join(["%s:%s" % (DJANGO_CT, rm) for rm in model_choices]) ) narrow_searcher = None if narrow_queries is not None: # Potentially expensive? I don't see another way to do it in Whoosh... narrow_searcher = self.index.searcher() for nq in narrow_queries: recent_narrowed_results = narrow_searcher.search( self.parser.parse(force_str(nq)), limit=None ) if len(recent_narrowed_results) <= 0: return {"results": [], "hits": 0} if narrowed_results is not None: narrowed_results.filter(recent_narrowed_results) else: narrowed_results = recent_narrowed_results self.index = self.index.refresh() if self.index.doc_count(): searcher = self.index.searcher() parsed_query = self.parser.parse(query_string) # In the event of an invalid/stopworded query, recover gracefully. if parsed_query is None: return {"results": [], "hits": 0} page_num, page_length = self.calculate_page(start_offset, end_offset) search_kwargs = { "pagelen": page_length, "sortedby": sort_by, "reverse": reverse, "groupedby": group_by, } # Handle the case where the results have been narrowed. if narrowed_results is not None: search_kwargs["filter"] = narrowed_results try: raw_page = searcher.search_page(parsed_query, page_num, **search_kwargs) except ValueError: if not self.silently_fail: raise return {"results": [], "hits": 0, "spelling_suggestion": None} # Because as of Whoosh 2.5.1, it will return the wrong page of # results if you request something too high. 
:( if raw_page.pagenum < page_num: return {"results": [], "hits": 0, "spelling_suggestion": None} results = self._process_results( raw_page, highlight=highlight, query_string=query_string, spelling_query=spelling_query, result_class=result_class, facet_types=facet_types, ) searcher.close() if hasattr(narrow_searcher, "close"): narrow_searcher.close() return results else: if self.include_spelling: if spelling_query: spelling_suggestion = self.create_spelling_suggestion( spelling_query ) else: spelling_suggestion = self.create_spelling_suggestion(query_string) else: spelling_suggestion = None return { "results": [], "hits": 0, "spelling_suggestion": spelling_suggestion, } def more_like_this( self, model_instance, additional_query_string=None, start_offset=0, end_offset=None, models=None, limit_to_registered_models=None, result_class=None, **kwargs ): if not self.setup_complete: self.setup() field_name = self.content_field_name narrow_queries = set() narrowed_results = None self.index = self.index.refresh() if limit_to_registered_models is None: limit_to_registered_models = getattr( settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True ) if models and len(models): model_choices = sorted(get_model_ct(model) for model in models) elif limit_to_registered_models: # Using narrow queries, limit the results to only models handled # with the current routers. model_choices = self.build_models_list() else: model_choices = [] if len(model_choices) > 0: if narrow_queries is None: narrow_queries = set() narrow_queries.add( " OR ".join(["%s:%s" % (DJANGO_CT, rm) for rm in model_choices]) ) if additional_query_string and additional_query_string != "*": narrow_queries.add(additional_query_string) narrow_searcher = None if narrow_queries is not None: # Potentially expensive? I don't see another way to do it in Whoosh... 
narrow_searcher = self.index.searcher() for nq in narrow_queries: recent_narrowed_results = narrow_searcher.search( self.parser.parse(force_str(nq)), limit=None ) if len(recent_narrowed_results) <= 0: return {"results": [], "hits": 0} if narrowed_results: narrowed_results.filter(recent_narrowed_results) else: narrowed_results = recent_narrowed_results page_num, page_length = self.calculate_page(start_offset, end_offset) self.index = self.index.refresh() raw_results = EmptyResults() searcher = None if self.index.doc_count(): query = "%s:%s" % (ID, get_identifier(model_instance)) searcher = self.index.searcher() parsed_query = self.parser.parse(query) results = searcher.search(parsed_query) if len(results): raw_results = results[0].more_like_this(field_name, top=end_offset) # Handle the case where the results have been narrowed. if narrowed_results is not None and hasattr(raw_results, "filter"): raw_results.filter(narrowed_results) try: raw_page = ResultsPage(raw_results, page_num, page_length) except ValueError: if not self.silently_fail: raise return {"results": [], "hits": 0, "spelling_suggestion": None} # Because as of Whoosh 2.5.1, it will return the wrong page of # results if you request something too high. :( if raw_page.pagenum < page_num: return {"results": [], "hits": 0, "spelling_suggestion": None} results = self._process_results(raw_page, result_class=result_class) if searcher: searcher.close() if hasattr(narrow_searcher, "close"): narrow_searcher.close() return results def _process_results( self, raw_page, highlight=False, query_string="", spelling_query=None, result_class=None, facet_types=None, ): from haystack import connections results = [] # It's important to grab the hits first before slicing. Otherwise, this # can cause pagination failures. 
hits = len(raw_page) if result_class is None: result_class = SearchResult spelling_suggestion = None unified_index = connections[self.connection_alias].get_unified_index() indexed_models = unified_index.get_indexed_models() facets = {} if facet_types: facets = { "fields": {}, "dates": {}, "queries": {}, } for facet_fieldname in raw_page.results.facet_names(): group = raw_page.results.groups(facet_fieldname) facet_type = facet_types[facet_fieldname] # Extract None item for later processing, if present. none_item = group.pop(None, None) lst = facets[facet_type][facet_fieldname] = sorted( group.items(), key=(lambda itm: (-itm[1], itm[0])) ) if none_item is not None: # Inject None item back into the results. none_entry = (None, none_item) if not lst or lst[-1][1] >= none_item: lst.append(none_entry) else: for i, value in enumerate(lst): if value[1] < none_item: lst.insert(i, none_entry) break for doc_offset, raw_result in enumerate(raw_page): score = raw_page.score(doc_offset) or 0 app_label, model_name = raw_result[DJANGO_CT].split(".") additional_fields = {} model = haystack_get_model(app_label, model_name) if model and model in indexed_models: for key, value in raw_result.items(): index = unified_index.get_index(model) string_key = str(key) if string_key in index.fields and hasattr( index.fields[string_key], "convert" ): # Special-cased due to the nature of KEYWORD fields. 
if index.fields[string_key].is_multivalued: if value is None or len(value) == 0: additional_fields[string_key] = [] else: additional_fields[string_key] = value.split(",") else: additional_fields[string_key] = index.fields[ string_key ].convert(value) else: additional_fields[string_key] = self._to_python(value) del additional_fields[DJANGO_CT] del additional_fields[DJANGO_ID] if highlight: sa = StemmingAnalyzer() formatter = WhooshHtmlFormatter("em") terms = [token.text for token in sa(query_string)] whoosh_result = whoosh_highlight( additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(), formatter, ) additional_fields["highlighted"] = { self.content_field_name: [whoosh_result] } result = result_class( app_label, model_name, raw_result[DJANGO_ID], score, **additional_fields ) results.append(result) else: hits -= 1 if self.include_spelling: if spelling_query: spelling_suggestion = self.create_spelling_suggestion(spelling_query) else: spelling_suggestion = self.create_spelling_suggestion(query_string) return { "results": results, "hits": hits, "facets": facets, "spelling_suggestion": spelling_suggestion, } def create_spelling_suggestion(self, query_string): spelling_suggestion = None reader = self.index.reader() corrector = reader.corrector(self.content_field_name) cleaned_query = force_str(query_string) if not query_string: return spelling_suggestion # Clean the string. for rev_word in self.RESERVED_WORDS: cleaned_query = cleaned_query.replace(rev_word, "") for rev_char in self.RESERVED_CHARACTERS: cleaned_query = cleaned_query.replace(rev_char, "") # Break it down. query_words = cleaned_query.split() suggested_words = [] for word in query_words: suggestions = corrector.suggest(word, limit=1) if len(suggestions) > 0: suggested_words.append(suggestions[0]) spelling_suggestion = " ".join(suggested_words) return spelling_suggestion def _from_python(self, value): """ Converts Python values to a string for Whoosh. Code courtesy of pysolr. 
""" if hasattr(value, "strftime"): if not hasattr(value, "hour"): value = datetime(value.year, value.month, value.day, 0, 0, 0) elif isinstance(value, bool): if value: value = "true" else: value = "false" elif isinstance(value, (list, tuple)): value = ",".join([force_str(v) for v in value]) elif isinstance(value, (int, float)): # Leave it alone. pass else: value = force_str(value) return value def _to_python(self, value): """ Converts values from Whoosh to native Python values. A port of the same method in pysolr, as they deal with data the same way. """ if value == "true": return True elif value == "false": return False if value and isinstance(value, str): possible_datetime = DATETIME_REGEX.search(value) if possible_datetime: date_values = possible_datetime.groupdict() for dk, dv in date_values.items(): date_values[dk] = int(dv) return datetime( date_values["year"], date_values["month"], date_values["day"], date_values["hour"], date_values["minute"], date_values["second"], ) try: # Attempt to use json to load the values. converted_value = json.loads(value) # Try to handle most built-in types. if isinstance( converted_value, (list, tuple, set, dict, int, float, complex), ): return converted_value except Exception: # If it fails (SyntaxError or its ilk) or we don't trust it, # continue on. pass return value
] schema = fields.Schema(keyword=fields.TEXT(stored=True), index=fields.ID(stored=True), content=fields.TEXT(stored=True)) if not os.path.exists("index"): os.mkdir('index') ix = index.create_in('index', schema) ix = index.open_dir('index') writer = ix.writer() # writer.add_document(keyword='my document', content='this is my document') # writer.add_document(keyword='my second document', content='this is my second document') for li, line in enumerate(lines): for word in list(jieba.cut(line)): print(word) writer.add_document(keyword=word, content=line, index=str(li)) writer.commit() from whoosh.qparser import QueryParser with ix.searcher() as searcher: query = QueryParser('keyword', ix.schema).parse('老人') result = searcher.search(query) # print("=======================" + result) for res in result: print(res)
# Parse command-line options into a dict of paths and tuning parameters.
args_dict = comand_line_set()
index_path = args_dict.get("indexpath")
logpath = args_dict.get("logpath")
user_dict_path = args_dict.get("userdictpath")
comp_dict_path = args_dict.get("compdictpath")
stop_word_path = args_dict.get("stopwordpath")
modelpath = args_dict.get("modelpath")
get_title_number = args_dict.get("titlenumber")
get_similar_number = args_dict.get("similarnumber")

gl.write_log(logpath, 'info', "\n\n")
loginfo = ' word retrieval service starting...'
gl.write_log(logpath, 'info', loginfo)

# preload dicts to save running time
tml.load_dicts(user_dict_path, logpath)
tml.load_dicts(comp_dict_path, logpath)
stopwords = tml.get_stopwords(stop_word_path, logpath)

# Open the inverted index and keep a long-lived searcher and parser for
# the "segwords" field.
ix = open_dir(index_path)  # for read only
index_searcher = ix.searcher()
query_parser = QueryParser("segwords", schema=ix.schema)
loginfo = ' inverted index file %s has been opened.' % index_path
gl.write_log(logpath, 'info', loginfo)

# preload similar model to save running time
similar_model = tws.load_wordVectors(modelpath)

# port number should be changed when deploying
app.run(debug=True, host='0.0.0.0', port=8888)
class WhooshBoostBackendTestCase(TestCase):
    """Exercises field boosting against the Whoosh Haystack backend."""

    def setUp(self):
        """Point the 'whoosh' connection at a throwaway index with only the
        boost mock search index installed, and build four fixture models."""
        super(WhooshBoostBackendTestCase, self).setUp()

        # Stow.
        # Original connection path and unified index are saved so
        # tearDown() can restore them.
        temp_path = os.path.join('tmp', 'test_whoosh_query')
        self.old_whoosh_path = settings.HAYSTACK_CONNECTIONS['whoosh']['PATH']
        settings.HAYSTACK_CONNECTIONS['whoosh']['PATH'] = temp_path

        self.old_ui = connections['whoosh'].get_unified_index()
        self.ui = UnifiedIndex()
        self.wmmi = WhooshBoostMockSearchIndex()
        self.ui.build(indexes=[self.wmmi])
        self.sb = connections['whoosh'].get_backend()
        connections['whoosh']._index = self.ui
        self.sb.setup()
        self.raw_whoosh = self.sb.index
        self.parser = QueryParser(self.sb.content_field_name,
                                  schema=self.sb.schema)
        # Start from an empty index for every test.
        self.sb.delete_index()

        # Four fixtures: author/editor swap between 'daniel' and 'david'
        # on odd/even ids, each a day further in the past.
        self.sample_objs = []

        for i in range(1, 5):
            mock = AFourthMockModel()
            mock.id = i

            if i % 2:
                mock.author = 'daniel'
                mock.editor = 'david'
            else:
                mock.author = 'david'
                mock.editor = 'daniel'

            mock.pub_date = date(2009, 2, 25) - timedelta(days=i)
            self.sample_objs.append(mock)

    def tearDown(self):
        """Delete the temporary index and restore the stowed settings."""
        if os.path.exists(settings.HAYSTACK_CONNECTIONS['whoosh']['PATH']):
            shutil.rmtree(settings.HAYSTACK_CONNECTIONS['whoosh']['PATH'])

        settings.HAYSTACK_CONNECTIONS['whoosh']['PATH'] = self.old_whoosh_path
        # NOTE(review): this reinstalls self.ui (the test's own index), not
        # self.old_ui that setUp() stowed -- the original unified index is
        # never restored; looks like a latent bug, confirm upstream.
        connections['whoosh']._index = self.ui
        super(WhooshBoostBackendTestCase, self).tearDown()

    @unittest.expectedFailure
    def test_boost(self):
        """Boosting should affect scores; marked expectedFailure on Whoosh."""
        self.sb.update(self.wmmi, self.sample_objs)
        self.raw_whoosh = self.raw_whoosh.refresh()
        searcher = self.raw_whoosh.searcher()
        # NOTE(review): asserts 2 hits for '*' although 4 objects were
        # indexed -- part of why this test is expected to fail; confirm.
        self.assertEqual(
            len(searcher.search(self.parser.parse(u'*'), limit=1000)), 2)

        results = SearchQuerySet('whoosh').filter(
            SQ(author='daniel') | SQ(editor='daniel'))
        self.assertEqual([result.id for result in results], [
            'core.afourthmockmodel.1',
            'core.afourthmockmodel.3',
        ])
        self.assertEqual(results[0].boost, 1.1)
#imports
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from pprint import pprint
from whoosh.fields import Schema, TEXT

#busca
# Interactive search over the "whoosh_index" directory.
busca = input("Busca:")
query = busca.split(" ")
print(query)

ix = None
try:
    schema = Schema(titulo=TEXT(stored=True),
                    conteudo_tkn=TEXT(stored=True),
                    conteudo_full=TEXT(stored=True))
    parser = QueryParser("conteudo_tkn", schema)
    # Bug fix: search for what the user actually typed. The query was
    # previously hard-coded to "eu OR você", silently discarding the
    # input collected at the prompt above.
    myquery = parser.parse(busca)
    ix = open_dir("whoosh_index")
    with ix.searcher() as searcher:
        results = searcher.search(myquery, terms=True)
        print("Retrieved: ", len(results), ' documents!')
        for ri in results:
            print('score:', ri.score, 'of document:', ri.docnum)
except Exception:
    # Bug fix: the bare `except:` unconditionally called ix.close(); if
    # open_dir() itself failed, `ix` was unbound and the handler raised a
    # NameError that masked the real error. Close only when it was opened.
    if ix is not None:
        ix.close()
print("Search: done. :D")
# coding=utf-8
"""Query the Lagou job-posting index and print the first page of hits."""
from whoosh import scoring
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh import query

index_dir = 'lagou_idx'
ix = open_dir(index_dir)
searcher = ix.searcher()

from whoosh import qparser

# OR semantics with a 0.9 scaling factor so documents matching more of
# the terms still rank higher than single-term matches.
or_group = qparser.OrGroup.factory(0.9)
parser = QueryParser("desc", schema=ix.schema, group=or_group)  # single-field parser

keyword = u'Java Python city:上海'
parsed_query = parser.parse(keyword)

# First page, five hits per page.
results = searcher.search_page(parsed_query, 1, pagelen=5)
print(u'{0} results found for keyword {1}, {2} returned: '.format(
    len(results), keyword, results.scored_length()))

for hit in results[:50]:
    for field_name in ('id', 'name', 'com_name'):
        print(hit[field_name])
    print('************')
def do_repository():
    """Return stored node/edge queries from the repository index as JSON.

    Query parameters:
        type: 'node', 'edge' or 'all' -- which class of saved queries to return.
        view: 'property' or 'seed' -- which index directory to search.

    Returns a JSON string with 'node_queries' and/or 'edge_queries' lists,
    or an HTTP 400 response for missing/unknown parameters.
    """
    qtype = request.query.get('type')
    view = request.query.get('view')

    if view == 'property':
        ix = index.open_dir(index_path1)
    elif view == 'seed':
        ix = index.open_dir(index_path2)
    else:
        # Bug fix: an unknown/missing view previously left `ix` unbound and
        # the handler crashed with a NameError further down.
        return HTTPResponse(status=400, body="Invalid Input")

    if qtype is None:
        # Bug fix: type.lower() raised AttributeError when the parameter
        # was absent.
        return HTTPResponse(status=400, body="Invalid Input")

    query = QueryParser('target', schema=ix.schema)

    def collect(target):
        # One searcher pass: copy the stored fields of every hit whose
        # 'target' field matches (deduplicates the three near-identical
        # loops the original repeated per branch).
        q = query.parse(target)
        hits = []
        with ix.searcher() as s:
            for hit in s.search(q, limit=None):
                hits.append({
                    "query_id": hit["query_id"],
                    "content": hit["content"],
                    "graph": hit["network_name"],
                    "results": hit["results"],
                    "target": hit["target"],
                })
        return hits

    qtype = qtype.lower()
    if qtype == 'node':
        return json.dumps({"node_queries": collect('node')})
    elif qtype == 'edge':
        return json.dumps({"edge_queries": collect('edge')})
    elif qtype == 'all':
        return json.dumps({"node_queries": collect('node'),
                           "edge_queries": collect('edge')})
    else:
        return HTTPResponse(status=400, body="Invalid Input")
    def index(self, repo_name=None):
        """Render full-text search results for the search page.

        GET params: ``q`` (query string), ``type`` (content | commit |
        path | repository) and ``page``. When *repo_name* is given, the
        query is restricted to that repository.
        """
        c.repo_name = repo_name
        c.formated_results = []
        c.runtime = ''
        c.cur_query = request.GET.get('q', None)
        c.cur_type = request.GET.get('type', 'content')
        # Map the UI search type onto the schema field to query; unknown
        # types fall back to full-content search.
        c.cur_search = search_type = {'content': 'content',
                                      'commit': 'message',
                                      'path': 'path',
                                      'repository': 'repository'}.get(c.cur_type, 'content')

        # Commit searches use a separate index/schema from file searches.
        index_name = {'content': IDX_NAME,
                      'commit': CHGSET_IDX_NAME,
                      'path': IDX_NAME}.get(c.cur_type, IDX_NAME)

        schema_defn = {'content': SCHEMA,
                       'commit': CHGSETS_SCHEMA,
                       'path': SCHEMA}.get(c.cur_type, SCHEMA)

        log.debug('IDX: %s' % index_name)
        log.debug('SCHEMA: %s' % schema_defn)

        if c.cur_query:
            # Whoosh terms are indexed lowercased.
            cur_query = c.cur_query.lower()
            log.debug(cur_query)

        if c.cur_query:
            p = safe_int(request.GET.get('page', 1), 1)
            highlight_items = set()
            try:
                idx = open_dir(config['app_conf']['index_dir'],
                               indexname=index_name)
                searcher = idx.searcher()

                qp = QueryParser(search_type, schema=schema_defn)
                if c.repo_name:
                    # Scope the query to a single repository.
                    cur_query = u'repository:%s %s' % (c.repo_name, cur_query)
                try:
                    query = qp.parse(unicode(cur_query))
                    # extract words for highlight
                    if isinstance(query, Phrase):
                        highlight_items.update(query.words)
                    elif isinstance(query, Prefix):
                        highlight_items.add(query.text)
                    else:
                        for i in query.all_terms():
                            if i[0] in ['content', 'message']:
                                highlight_items.add(i[1])

                    matcher = query.matcher(searcher)

                    log.debug('query: %s' % query)
                    log.debug('hl terms: %s' % highlight_items)
                    results = searcher.search(query)
                    res_ln = len(results)
                    c.runtime = '%s results (%.3f seconds)' % (
                        res_ln, results.runtime)

                    def url_generator(**kw):
                        # Keeps q/type in pagination links, letting Page
                        # substitute only the page number.
                        q = urllib.quote(safe_str(c.cur_query))
                        return update_params("?q=%s&type=%s" \
                            % (q, safe_str(c.cur_type)), **kw)

                    repo_location = RepoModel().repos_path
                    c.formated_results = Page(
                        WhooshResultWrapper(search_type, searcher, matcher,
                                            highlight_items, repo_location),
                        page=p, item_count=res_ln,
                        items_per_page=10, url=url_generator)
                except QueryParserError:
                    c.runtime = _('Invalid search query. Try quoting it.')
                # NOTE(review): searcher is closed here while the Page /
                # WhooshResultWrapper above may still reference it --
                # presumably the wrapper materializes results eagerly;
                # confirm before reordering.
                searcher.close()
            except (EmptyIndexError, IOError):
                log.error(traceback.format_exc())
                log.error('Empty Index data')
                c.runtime = _('There is no index to search in. '
                              'Please run whoosh indexer')
            except (Exception):
                log.error(traceback.format_exc())
                c.runtime = _('An error occurred during this search operation')

        # Return a rendered template
        return render('/search/search.html')
from whoosh.index import create_in
from whoosh.fields import *

# Two stored fields plus an indexed-only body field.
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
ix = create_in('idx', schema)

# Index two sample documents, then commit in one batch.
sample_docs = [
    (u'First Document', u'/a', u'This is the first document we have added!'),
    (u'Second Document', u'/b', u'The second one is even more interesting!'),
]
writer = ix.writer()
for doc_title, doc_path, doc_body in sample_docs:
    writer.add_document(title=doc_title, path=doc_path, content=doc_body)
writer.commit()

from whoosh.qparser import QueryParser

# Search the body field and print every matching hit.
with ix.searcher() as searcher:
    parsed = QueryParser('content', ix.schema).parse('first')
    for hit in searcher.search(parsed):
        print(hit)
elif Score == "tf": myscore = scoring.Frequency() elif Score == "multi": myscore = scoring.MultiWeighting(scoring.BM25F(), id=scoring.Frequency(), keys=scoring.TF_IDF()) else: myscore = scoring.BM25F() #---------------Input Query---------------------- schema = Schema(title=TEXT(stored=True, analyzer=StemmingAnalyzer()), content=TEXT(stored=True)) All_Result = [] ix = open_dir(InputIndexDir) sf = torch.nn.Softmax(dim=0) alldata = read_json(input_data_file) with ix.searcher(weighting=myscore) as searcher: parser = QueryParser("title", ix.schema,group=qparser.OrGroup) for item in tqdm(alldata): search_result= {} for keyword,plau_en_mentions in item['plausible_en_mentions'].items(): per_uris = [] per_search_result =[] for (word, score) in plau_en_mentions[0:Pivots_N]: query = parser.parse(word) results = SearchQuery(searcher, query, Search_N) hit_score = [hit.score for hit in results] new_score = sf(torch.Tensor(hit_score)).tolist() new_score = [score * s for s in new_score] hit_title = [hit['title'] for hit in results] hit_content = [hit['content'] for hit in results] per_search_result.extend(list(zip(hit_title, hit_content, new_score))) for c_result in per_search_result:
#!/usr/bin/env python # -*- coding: utf-8 -*- # remove a article from the index index import sys import config input_xml_file_name = config.input_xml_file_name sys.path.append('..') from whoosh.qparser import QueryParser from whoosh.index import open_dir if len(sys.argv) > 1: article_title = sys.argv[1] else: print "Usage remove_from_index.py article" ix = open_dir("index_dir") query = QueryParser("title", ix.schema).parse("'%s'" % unicode(article_title)) ix.delete_by_query(query) ix.writer().commit()
from whoosh.qparser import QueryParser
from whoosh import scoring
from whoosh.index import open_dir
import sys

# Interactive TF-IDF search over indexed summaries ('synopsis' field).
ix = open_dir("index")
print("==========SUMMARY SEARCH ENGINE==========")
query_str = input("Search: ")
print("\nResults: ")

with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
    query = QueryParser("synopsis", schema=ix.schema).parse(query_str)
    results = searcher.search(query, limit=1000)
    last = len(results)
    # NOTE(review): openRes() below indexes results[choose * 2 - 2], so
    # each hit seems to occupy two display slots; this page count
    # (/ 2 / 10.0, then a `% 10` test on a float) looks suspect --
    # verify against the paging UI beyond this excerpt.
    lastPage = last / 2 / 10.0
    if (lastPage % 10 != 0):
        lastPage = int(lastPage) + 1
    else:
        lastPage = int(lastPage)
    pageNum = 1

    def openRes(results, choose, pageNum):
        # Print the full record for the chosen result; `choose` is the
        # 1-based on-screen number, mapped to the hit two slots back.
        print("Title:", results[choose * 2 - 2]['title'])
        print("Author:", results[choose * 2 - 2]['author'])
        print("Date:", results[choose * 2 - 2]['date'])
        print("Genre:", results[choose * 2 - 2]['genre'])
        print("Summary:\n", results[choose * 2 - 2]['synopsis'])
        action = input("Return to search? (y/n):")
        # (function body continues beyond this excerpt)
        if (action == 'y' or action == 'Y'):