def search(query_text, expansion=False):
    with ix.searcher() as searcher:
        og = qparser.OrGroup.factory(0.8)
        parser = MultifieldParser(["question", "answer"], ix.schema, group=og)
        # parser.add_plugin(qparser.FuzzyTermPlugin())
        if expansion:
            query_expanded = ''
            # get synonyms for the query text
            syns = get_syns(query_text)
            for word in query_text.split():
                boosted_word = word + '^3'
                query_expanded = query_expanded + ' ' + boosted_word
            query_expanded = query_expanded + syns
            print(f'Search for: {query_expanded}')
            query = parser.parse(query_expanded)
        else:
            query = parser.parse(query_text)
            print(f'Search for: {query_text}')
        results = searcher.search(query, limit=1)
        # runtime = results.runtime
        # flatten the hit into a string so it can be used after the reader closes
        answer = ''
        for passage in results:
            answer += ''.join([passage['question'], '\n', passage['answer']])
        return answer

def search(query_text, expansion=True):
    with ix.searcher() as searcher:
        og = qparser.OrGroup.factory(0.8)
        parser = MultifieldParser(["title", "content"], ix.schema, group=og)
        # parser.add_plugin(qparser.FuzzyTermPlugin())
        if expansion:
            query_expanded = ''
            # get synonyms for the query text
            syns = get_syns(query_text)
            for word in query_text.split():
                boosted_word = word + '^3'
                query_expanded = query_expanded + ' ' + boosted_word
            query_expanded = query_expanded + syns
            print(f'Search for: {query_expanded}')
            query = parser.parse(query_expanded)
        else:
            query = parser.parse(query_text)
            print(f'Search for: {query_text}')
        results = searcher.search(query, limit=20)
        # print(results[0:2])
        runtime = results.runtime
        # transform the hits into a list of dicts so they can be accessed after the reader closes
        result_list = []
        for passage in results:
            result_list.append({
                'title': passage['title'],
                'url': passage['url'],
                'content': passage['content']
            })
        return result_list, runtime

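# Both variants of search() above call a get_syns() helper that returns a string
# of synonyms to append to the boosted query. A minimal sketch of such a helper,
# assuming NLTK's WordNet corpus is available (nltk.download('wordnet')); the
# WordNet-based implementation is an assumption, only the name and the
# return-a-string contract come from the callers above.
from nltk.corpus import wordnet

def get_syns(query_text, max_syns=5):
    syns = []
    for word in query_text.split():
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                name = lemma.name().replace('_', ' ')
                if name.lower() != word.lower() and name not in syns:
                    syns.append(name)
    # callers concatenate this directly onto the boosted query string
    return ' ' + ' '.join(syns[:max_syns]) if syns else ''
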
def search(querytext, request, pagenum=1, maxresults=30, staff=False, scope=None,
           orderby='-creation_date'):
    search_engine = get_search_engine('resource')
    search_result = {}
    if pagenum < 1:
        pagenum = 1
    with search_engine.searcher() as searcher:
        parser = MultifieldParser(search_engine.default_search_fields, searcher.schema)
        user_q = querytext and parser.parse(querytext) or Every()
        user_q, search_kwargs = build_search_kwargs(user_q, request, scope, staff, orderby)
        hits = searcher.search(user_q, limit=(pagenum * maxresults) + 1, **search_kwargs)
        if querytext and hits.is_empty():
            correction_q = parser.parse(querytext)
            corrected = searcher.correct_query(correction_q, querytext)
            if corrected.query != correction_q:
                querytext = corrected.string
                search_result['corrected_q'] = querytext
                user_q, search_kwargs = build_search_kwargs(corrected.query, request, scope,
                                                            staff, orderby)
                hits = searcher.search(user_q, limit=(pagenum * maxresults), **search_kwargs)
        search_engine.prepare_search_response(search_result, hits, pagenum, maxresults)
        search_result['results'] = add_other_versions(searcher, search_result['results'],
                                                      request.user, staff)
        add_absolute_urls(search_result['results'], request)
    return search_result

def find(cmd, criteria, reindex=False):
    from whoosh.qparser import MultifieldParser
    if reindex:
        _create_index(cmd.cli_ctx)
    try:
        ix = _get_index(cmd.cli_ctx)
    except ValueError:
        # got a pickle error because the index was written by a different python version
        # recreate the index and proceed
        _create_index(cmd.cli_ctx)
        ix = _get_index(cmd.cli_ctx)
    qp = MultifieldParser(
        ['cmd_name', 'short_summary', 'long_summary', 'examples'],
        schema=_get_schema())
    if 'OR' in criteria or 'AND' in criteria:
        # looks more advanced, let's trust them to make a great query
        q = qp.parse(" ".join(criteria))
    else:
        # let's help out with some OR's to provide a less restrictive search
        expanded_query = " OR ".join(criteria) + " OR '{}'".format(criteria)
        q = qp.parse(expanded_query)
    with ix.searcher() as searcher:
        from whoosh.highlight import UppercaseFormatter, ContextFragmenter
        results = searcher.search(q)
        results.fragmenter = ContextFragmenter(maxchars=300, surround=200)
        results.formatter = UppercaseFormatter()
        for hit in results:
            _print_hit(hit)

class Searcher(object):
    """
    Assigned to a Model class as ``search_query``, which enables text-querying.
    """

    def __init__(self, model_class, primary, index, session=None):
        self.model_class = model_class
        self.primary = primary
        self.index = index
        self.session = session
        self.searcher = index.searcher()
        fields = set(index.schema._fields.keys()) - set([self.primary])
        self.parser = MultifieldParser(list(fields), index.schema)

    def __call__(self, query, limit=20, pagenum=1, pagelen=20):
        session = self.session
        # When using Flask, get the session from the query attached to the model class.
        if not session:
            session = self.model_class.query.session
        if not pagenum:
            results = self.index.searcher().search(self.parser.parse(query), limit=limit)
        else:
            results = self.index.searcher().search_page(
                self.parser.parse(query), pagenum=pagenum, pagelen=pagelen)
        keys = [x[self.primary] for x in results]
        primary_column = getattr(self.model_class, self.primary)
        return session.query(self.model_class).filter(primary_column.in_(keys))

def search(searchwords, search_fields, index_file):
    ix = index.open_dir(index_file)
    # facet = sorting.FieldFacet("comment_num", reverse=True)
    searcher = ix.searcher()
    qp = MultifieldParser(search_fields, schema=ix.schema)
    results = []
    kws = []
    if './index/farm_products_index' in index_file:
        for kw in cut_for_search(searchwords):
            q = qp.parse(kw)
            res = list(searcher.search(q, limit=50))
            if len(res):
                results.append(res)
                kws.append(kw)
    elif './index/job_index' in index_file:
        jieba.load_userdict('./data/cut_words.txt')
        t = np.array(list(cut_for_search(searchwords)))
        t_p = map(lambda x: "'" + str(x) + "'", t)
        job, city = get_search_words(','.join(t_p))
        s = ' '.join(job) + ' ' if job else ''
        s += ' '.join(city) if city else ''
        # cuted_s = ' '.join(job)
        q = qp.parse(s)
        r = searcher.search(q, terms=True, limit=50)
        res = list(r)
        if len(res):
            results.append(res)
            kws.append('test')
    return results, kws

def find(criteria, reindex=False):
    """
    Search for Azure CLI commands
    :param str criteria: Query text to search for.
    :param bool reindex: Clear the current index and reindex the command modules.
    :return:
    :rtype: None
    """
    if reindex:
        _create_index()
    ix = _get_index()
    qp = MultifieldParser(
        ['cmd_name', 'short_summary', 'long_summary', 'examples'],
        schema=schema
    )
    if 'OR' in criteria or 'AND' in criteria:
        # looks more advanced, let's trust them to make a great query
        q = qp.parse(" ".join(criteria))
    else:
        # let's help out with some OR's to provide a less restrictive search
        q = qp.parse(" OR ".join(criteria))
    with ix.searcher() as searcher:
        results = searcher.search(q)
        results.fragmenter = ContextFragmenter(maxchars=300, surround=200)
        results.formatter = UppercaseFormatter()
        for hit in results:
            _print_hit(hit)

class Searcher(object):
    """
    Assigned to a Model class as ``search_query``, which enables text-querying.
    """

    def __init__(self, model_class, primary, index):
        self.model_class = model_class
        self.primary = primary
        self.index = index
        self.searcher = index.searcher()
        fields = set(index.schema._fields.keys()) - set([self.primary])
        self.parser = MultifieldParser(list(fields), index.schema)

    def __call__(self, query, limit=None):
        """API similar to SQLAlchemy's queries."""
        session = self.model_class.query.session
        results = self.index.searcher().search(self.parser.parse(query), limit=limit)
        keys = [x[self.primary] for x in results]
        if not keys:
            # Dummy request...
            return session.query(self.model_class).filter("uid = -1")
        else:
            primary_column = getattr(self.model_class, self.primary)
            return session.query(self.model_class).filter(primary_column.in_(keys))

    def search(self, query, limit=None):
        """New API: returns both whoosh records and SA models."""
        # TODO: highly suboptimal
        session = self.model_class.query.session
        hits = self.index.searcher().search(self.parser.parse(query), limit=limit)
        for hit in hits:
            yield (hit, session.query(self.model_class).get(hit[self.primary]))

def search_results(ix, search_query, fields):
    qpo = MultifieldParser(fields, schema=ix.schema, group=qparser.OrGroup)
    qpa = MultifieldParser(fields, schema=ix.schema)
    qo = qpo.parse(search_query)
    qa = qpa.parse(search_query)
    data = []
    data_index = 0
    with ix.searcher() as s:
        resultsa = s.search(qa)
        resultso = s.search(qo)
        for hit in resultsa:
            data.append(dict(**hit))
            context = str()
            for field in fields:
                if len(hit.highlights(field)) > 0 and hit.highlights(field) not in context:
                    context += re.sub(r"(\(.*[^\)])", r'\1)', hit.highlights(field))
            data[data_index]["context"] = context
            data_index += 1
        for hit in resultso:
            found = False
            for hita in resultsa:
                if hit["id"] == hita["id"]:
                    found = True
            if not found:
                data.append(dict(**hit))
                context = str()
                for field in fields:
                    if len(hit.highlights(field)) > 0 and hit.highlights(field) not in context:
                        context += re.sub(r"(\(.*[^\)])", r'\1)', hit.highlights(field))
                data[data_index]["context"] = context
                data_index += 1
    return data

def make_search_service(search_text):
    charmap = charset_table_to_dict(default_charset)  # unused
    custom_analyzers = StemmingAnalyzer()  # unused
    index_path = join(pathlib.Path(__file__).parent.parent.absolute(), 'indexdir')
    myindex = open_dir(index_path)
    qp = MultifieldParser(["title", "textdata"], schema=myindex.schema, group=AndGroup,
                          fieldboosts={'title': 3.0, 'textdata': 0.8})
    qstring = search_text
    q = qp.parse(qstring)
    results_list = []
    myWeighting = scoring.MultiWeighting(scoring.BM25F(textdata_B=0.5),
                                         textdata=scoring.Frequency(),
                                         title=scoring.BM25F(title_B=2.0))
    with myindex.searcher(weighting=myWeighting) as s:
        results = s.search(q, limit=30, terms=True)
        # "did you mean" suggestion and results for the corrected query
        corrected = s.correct_query(q, qstring)
        did_you_mean = ''
        result_for = ''
        if corrected.query != q:
            if len(results) < 1:
                results = s.search(qp.parse(corrected.string), limit=30, terms=True)
                result_for = corrected.string
            else:
                did_you_mean = corrected.string
        # query expansion (the original guard `not keywords and keywords == " "`
        # could never be true; expand whenever key terms were found)
        keywords = [keyword for keyword, score in results.key_terms("textdata", docs=3, numterms=5)]
        if keywords:
            query_keyword = qp.parse(reduce(lambda a, b: a + ' ' + b, keywords))
            results_keyword = s.search(query_keyword, limit=30, terms=True)
            results.upgrade_and_extend(results_keyword)
        # sorting
        key_sort = lambda result: result.score
        results = sorted(results, key=key_sort, reverse=True)
        for ris in results:
            result = {}
            result['title'] = ris['title']
            result['url'] = ris['url']
            result['id'] = ris['ID']
            result['highlight'] = ris.highlights("textdata")
            results_list.append(result)
        # for computing precision and recall
        id_results = [ris['id'] for ris in results_list[:10]]
        return {
            'search_text': search_text,
            'results': results_list,
            'did_you_mean': did_you_mean,
            'result_for': result_for,
            'results_ids': id_results
        }

def perform_search(self, schema, field: str, query: str, page: int = 1, pagelen: int = 20):
    """
    Performs a query of the index from the given field and query string
    :param schema:
    :param field: String. Index field
    :param query: String.
    :param page: int. starting page of results to return results from
    :param pagelen: int. number of results to display per page
    :return: list. results
    """
    if field == '':
        # Get all schema fields
        fields = self.get_fields(schema=schema)
        results_dict = {}
        with self.schemas[schema].searcher() as searcher:
            last_page = False
            while not last_page:
                parser = MultifieldParser(fields, self.schemas[schema].schema)
                search_query = parser.parse(query)
                results = searcher.search_page(search_query, page, pagelen)
                if results.total > 0:
                    for doc in range(results.pagelen):
                        results_dict[results.docnum(doc)] = results.results.fields(doc)
                last_page = results.is_last_page()
                page += 1
            return results_dict
    else:
        results_dict = {}
        with self.schemas[schema].searcher() as searcher:
            last_page = False
            while not last_page:
                parser = QueryParser(field, self.schemas[schema].schema)
                search_query = parser.parse(query)
                results = searcher.search_page(search_query, page, pagelen)
                if results.total > 0:
                    for doc in range(results.pagelen):
                        results_dict[results.docnum(doc)] = results.results.fields(doc)
                last_page = results.is_last_page()
                page += 1
            return results_dict

def search(request):
    indexNewsObject = IndexNews()
    ix = indexNewsObject.ix
    if request.method == 'POST':
        inputQuery = request.POST['inputQuerySearchPage']
        request.session['inputQuery'] = inputQuery
        if inputQuery == '':
            context = {
                'message': 'لطفا عبارت مورد نظر خود را وارد کنید'  # "Please enter your search term"
            }
            return render(request, 'searchPage/searchPage.html', context=context)
        else:
            # queryParser = QueryParser(fieldname='content', schema=ix.schema, group=OrGroup)
            # queryParser = MultifieldParser(['title', 'content'], schema=ix.schema, group=OrGroup)
            queryParser = MultifieldParser(['title', 'content', 'summary'], schema=ix.schema)
            query = queryParser.parse(inputQuery)
            with ix.searcher(weighting=scoring.BM25F()) as searcher:
                results = searcher.search(query, terms=True, limit=None)
                # customize the html tag used to highlight matched terms
                htmlFormat = highlight.HtmlFormatter('b')
                results.formatter = htmlFormat
                results.fragmenter.maxchars = 300
                results.fragmenter.surround = 150
                paginator = Paginator(results, 15)
                page = request.GET.get('page')
                resultWithPage = paginator.get_page(page)
                context = {
                    'results': resultWithPage,
                    'inputQuery': inputQuery
                }
                return render(request, 'searchPage/searchPage.html', context=context)
    else:
        inputQuery = request.session['inputQuery']
        # queryParser = QueryParser(fieldname='content', schema=ix.schema, group=OrGroup)
        queryParser = MultifieldParser(['title', 'content', 'summary'], schema=ix.schema)
        query = queryParser.parse(inputQuery)
        with ix.searcher(weighting=scoring.BM25F()) as searcher:
            results = searcher.search(query, terms=True, limit=None)
            # customize the html tag used to highlight matched terms
            htmlFormat = highlight.HtmlFormatter('b')
            results.formatter = htmlFormat
            results.fragmenter.maxchars = 300
            results.fragmenter.surround = 150
            paginator = Paginator(results, 15)
            page = request.GET.get('page')
            resultWithPage = paginator.get_page(page)
            context = {
                'results': resultWithPage,
                'inputQuery': inputQuery
            }
            return render(request, 'searchPage/searchPage.html', context=context)

def search(self, query_list, fields=None):
    with self.ix.searcher() as searcher:
        query_list2 = []
        for qq in query_list:
            if qq == 'AND' or qq == 'OR':
                query_list2.append(qq)
            else:
                query_list2.append(qq.lower())
        query_string = " ".join(query_list2)
        query = None
        if ":" in query_string:
            # If the user DOES specify a field,
            # setting the fields determines what fields
            # are searched with the free terms (no field)
            fields = ['title', 'content', 'owner_name', 'owner_email', 'github_user']
            query = MultifieldParser(fields, schema=self.ix.schema)
            est = pytz.timezone('America/New_York')
            query.add_plugin(DateParserPlugin(free=True, basedate=est.localize(datetime.utcnow())))
            query.add_plugin(GtLtPlugin())
            try:
                query = query.parse(query_string)
            except Exception:
                # Work around brittle DateParser plugin parsing:
                # quote the value after each field prefix and retry
                query_string2 = re.sub(r':(\w+)', r":'\g<1>'", query_string)
                try:
                    query = query.parse(query_string2)
                except Exception:
                    print("parsing query %s failed" % (query_string))
                    print("parsing query %s also failed" % (query_string2))
                    query = query.parse('')
        else:
            # If the user does not specify a field,
            # these are the fields that are actually searched
            fields = ['url', 'title', 'content', 'owner_name', 'owner_email', 'github_user']
            query = MultifieldParser(fields, schema=self.ix.schema)
            est = pytz.timezone('America/New_York')
            query.add_plugin(DateParserPlugin(free=True, basedate=est.localize(datetime.utcnow())))
            query.add_plugin(GtLtPlugin())
            try:
                query = query.parse(query_string)
            except Exception:
                print("parsing query %s failed" % (query_string))
                query = query.parse('')
        parsed_query = "%s" % query
        print("query: %s" % parsed_query)
        results = searcher.search(query, terms=False, scored=True, groupedby="kind")
        search_result = self.create_search_result(results)
    return parsed_query, search_result

def search(self, q, tool_name_boost, tool_section_boost, tool_description_boost,
           tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit,
           tool_enable_ngram_search, tool_ngram_minsize, tool_ngram_maxsize):
    """
    Perform search on the in-memory index. Weight in the given boosts.
    """
    # Change field boosts for searcher
    searcher = self.index.searcher(
        weighting=BM25F(
            field_B={'name_B': float(tool_name_boost),
                     'section_B': float(tool_section_boost),
                     'description_B': float(tool_description_boost),
                     'labels_B': float(tool_label_boost),
                     'stub_B': float(tool_stub_boost),
                     'help_B': float(tool_help_boost)}
        )
    )
    # Set query to search name, description, section, help, and labels.
    parser = MultifieldParser(['name', 'description', 'section', 'help', 'labels', 'stub'],
                              schema=self.schema)
    # Hyphens are wildcards in Whoosh causing bad things
    if q.find('-') != -1:
        q = (' ').join([token.text for token in self.rex(to_unicode(q))])
    # Perform tool search with ngrams if set to true in the config file
    if tool_enable_ngram_search is True or tool_enable_ngram_search == "True":
        hits_with_score = {}
        token_analyzer = StandardAnalyzer() | analysis.NgramFilter(minsize=int(tool_ngram_minsize),
                                                                   maxsize=int(tool_ngram_maxsize))
        ngrams = [token.text for token in token_analyzer(q)]
        for query in ngrams:
            # Get the tool list with respective scores for each qgram
            curr_hits = searcher.search(parser.parse('*' + query + '*'),
                                        limit=float(tool_search_limit))
            for i, curr_hit in enumerate(curr_hits):
                is_present = False
                for prev_hit in hits_with_score:
                    # Check if the tool appears again for the next qgram search
                    if curr_hit['id'] == prev_hit:
                        is_present = True
                        # Add the current score to the previous one if the
                        # tool appears again for the next qgram
                        hits_with_score[prev_hit] = curr_hits.score(i) + hits_with_score[prev_hit]
                # Add the tool if not present to the collection with its score
                if not is_present:
                    hits_with_score[curr_hit['id']] = curr_hits.score(i)
        # Sort the results based on aggregated BM25 score in decreasing order of scores
        hits_with_score = sorted(hits_with_score.items(), key=lambda x: x[1], reverse=True)
        # Return the tool ids
        return [item[0] for item in hits_with_score[0:int(tool_search_limit)]]
    else:
        # Perform the search
        hits = searcher.search(parser.parse('*' + q + '*'), limit=float(tool_search_limit))
        return [hit['id'] for hit in hits]

def generic(idx, qs=None, q=None, limit=5, parser=None, page=1):
    if qs is q is None:
        raise ValueError('cannot have a null querystring and query')
    if parser is None:
        parser = MultifieldParser(
            ['title', 'keywords', 'summary', 'content', 'author'],
            idx.schema, group=OrGroup)
        # add better date parsing support
        parser.add_plugin(DateParserPlugin())
        parser.remove_plugin_class(WildcardPlugin)
    with idx.searcher() as search:
        # generate the Query object
        if qs:
            query = parser.parse(qs)
        else:
            query = q
        facet = MultiFacet()
        facet.add_score()
        facet.add_field('modified', reverse=True)
        facet.add_field('title')
        results = search.search_page(query, pagenum=page, sortedby=facet, pagelen=limit)
        res = clean_results(idx, results, query)
        # pagination attributes on `search_page` method
        res.page_number = results.pagenum   # current page number
        res.page_total = results.pagecount  # total pages in results
        res.offset = results.offset         # first result of current page
        res.pagelen = results.pagelen       # the number of max results per page
    return res

def search_documents(filter):
    # Check for existing index
    dir_path = os.path.join(DATA_DIR, 'index')
    if not os.path.exists(dir_path) or not Index.exists_in(dir_path):
        return None
    index = Index.open_dir(dir_path)
    if filter.startswith('tags:'):
        fields = ['tags']
        filter = filter[5:]
    else:
        fields = ['path', 'content']
    parser = MultifieldParser(fields, schema=index.schema)
    search_query = parser.parse(filter)
    # Try documents search
    try:
        searcher = index.searcher(closereader=False)
        return searcher.search(search_query,
                               collapse=[sorting.FieldFacet('path'),
                                         sorting.FieldFacet('content')],
                               collapse_order=sorting.FieldFacet('revision', reverse=True),
                               sortedby=[sorting.FieldFacet('path'),
                                         sorting.FieldFacet('date', reverse=True)])
    finally:
        searcher.close()

def search(page):
    search = request.args['q']
    storage = FileStorage(conf.INDEX_DIR)
    index = storage.open_index(indexname=conf.INDEX_NAME)
    qp = MultifieldParser(['title', 'text', 'tags'], schema=index.schema)
    q = qp.parse(search)
    results = []
    with index.searcher() as searcher:
        results = searcher.search_page(q, page, pagelen=conf.PAGE_SIZE)
        # Get real posts
        post_ids = ",".join(["'%s'" % p['post_id'] for p in results])
    if post_ids:
        ghost = get_melmelboo_connection()
        with ghost.cursor() as ghost_cur:
            ghost_cur.execute("SELECT title, feature_image, html, slug "
                              "FROM posts WHERE id IN (%s)" % post_ids)
            posts = [{
                'type': "post",
                'title': i[0],
                'image': i[1],
                'excerpt': excerpt(i[2]),
                'url': "/blog/" + i[3]
            } for i in ghost_cur.fetchall()]
        ghost.close()
    else:
        posts = []
    return render_template("search.html", posts=posts, search=search)

def search(keyword):
    """Search deals in the indexed file."""
    logging.debug('searching for: %s ... ', keyword.split())
    ix = open_dir(r'C:\crawlData\indexed')  # open the index directory
    qp = MultifieldParser(["title", "link"], schema=ix.schema)
    # qp.remove_plugin_class(PhrasePlugin)
    # qp.add_plugin(SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?"))
    # qp.add_plugin(PhrasePlugin(expr='"(?P<text>.*?)"(~(?P<slop>[1-9][0-9]*))?'))
    q = qp.parse(keyword)
    with ix.searcher() as s:
        results = s.search(q, limit=10)
        items = '['
        for hit in results:
            items = (items + '{"title":"' + hit['title'].replace('\n', ' ') +
                     '","link":"' + hit['link'] +
                     '","pSale":"' + hit['pSale'].replace('\n', ' ') +
                     '","pRegular":"' + hit['pRegular'].replace('\n', ' ') +
                     '","img":"' + hit['img'] + '"},')
            # print('matched term: ', hit.matched_terms())
        items = items[0:len(items) - 1] + ']'
        # print(items)
        return items

def parse(text, schema=SCHEMA):
    """
    parse(text[, schema=SCHEMA])

    Parses and processes the text in ``text`` according to the ``schema``
    of the document index.

    .. code-block:: python

        >>> from storyline.engine.query import parse
        >>> from storyline.engine.schema import get_schema
        >>>
        >>> SCHEMA = get_schema()
        >>> parse("Mestre", SCHEMA)
        Or([Term('title', u'mestr'), Term('content', u'mestr')])

    :param text: Query entered by the user.
    :type text: str
    :param schema: Schema of the document index.
    :type schema: Schema
    :returns: Query with terms and operators.
    """
    try:
        from whoosh.qparser import MultifieldParser
    except ImportError:
        print("An error occurred while importing the whoosh.qparser module.")
    qp = MultifieldParser(["title", "content"], schema, None)
    return qp.parse(text)

def __call__(self, query, limit=None, fields=None, or_=False):
    if fields is None:
        fields = self._all_fields
    group = OrGroup if or_ else AndGroup
    parser = MultifieldParser(fields, self._index.schema, group=group)
    return self._index.searcher().search(parser.parse(query), limit=limit)

def search_whoosh_index_headline(query, paths):
    if not paths:
        return []
    ix = get_whoosh_index()
    parser = MultifieldParser(['content', 'title', 'abstract'], ix.schema)
    q = parser.parse(query)
    allow_q = Or([Term('path', path) for path in paths])
    res = []
    with ix.searcher() as searcher:
        results = searcher.search(q, filter=allow_q, limit=len(paths), terms=True)
        for hit in results:
            res.append({
                # 'title': hit['title'],
                'short_url': hit['path'],
                'highlights': u' [...] '.join(
                    filter(None, [
                        hit.highlights("title", top=5),
                        hit.highlights("abstract", top=5),
                        hit.highlights("content", top=5)
                    ]))
            })
    return res

def search(querystring, language_code):
    ix = LanguageIndex(settings.WHOOSH_INDEX_PATH, language_code, _get_schema()).load()
    # parser = QueryParser('content', ix.schema)
    parser = MultifieldParser(['title', 'keywords', 'content'], ix.schema)
    #                         fieldboosts={'title': 5, 'keywords': 4, 'content': 1})
    parser.remove_plugin_class(WildcardPlugin)  # remove unused feature for better performance
    query = parser.parse(querystring)
    # print(parser, query, querystring)
    result = {
        'results': [],
    }
    with ix.searcher() as searcher:
        results = searcher.search(query)
        # print(results)
        # import pdb; pdb.set_trace()
        # collect results
        for hit in results:
            my_hit = {}
            # my_hit['pos'] = hit.pos
            # my_hit['rank'] = hit.rank
            # my_hit['docnum'] = hit.docnum
            my_hit['score'] = hit.score
            my_hit['object'] = Article.objects.get(code=hit.fields()['code'])
            # .exclude(published=False).exclude(release_date__gte=datetime.today())
            # my_hit['object']['is_visible'] = True
            result['results'].append(my_hit)
            # print(hit.pos, hit.rank, hit.docnum, hit.score, hit)
    return result

class Index:
    def __init__(self, path='~/Music/iTunes/iTunes Music Library.xml',
                 folder='~/Library/Application Support/Share my tunes'):
        self.path = os.path.expanduser(path)
        self.schema = Schema(
            trackId=ID(stored=True),
            name=TEXT(stored=True),
            artist=TEXT(stored=True),
            album=TEXT(stored=True),
            genre=KEYWORD(stored=True),
            location=STORED,
            trackNumber=STORED,
            bitRate=ID(stored=True),
            artwork=KEYWORD(stored=True)
        )
        self.parser = MultifieldParser(["name", "album", "artist"], schema=self.schema)
        self.folder = "%s/index" % os.path.expanduser(folder)
        self.empty = not whoosh.index.exists_in(self.folder)
        self.ix = None

    def index(self):
        if self.empty:
            if not os.path.exists(self.folder):
                os.makedirs(self.folder)
            st = FileStorage(self.folder)
            ix = st.create_index(self.schema)
            w = ix.writer()
            w.add_document(name=u"beuha")
            pipe = file.ID3Filter()
            # [TODO] using itunes info for artwork?
            cpt = 0
            for track in pipe(ItunesParser(self.path)):
                if track['album'] is not None:
                    album = track['album'].encode('ascii', 'ignore')
                else:
                    album = ""
                # print(track['artwork'], "[%s]" % album, track['name'].encode('ascii', 'ignore'))
                if cpt % 20 == 0:
                    print("\n%i " % cpt, end='')
                print('#', end='')
                # print(track['album'], track['name'])
                w.add_document(
                    trackId=track['trackId'], name=track['name'],
                    artist=track['artist'], album=track['album'],
                    genre=track['genre'], location=track['location'],
                    artwork=boolean(track['artwork']),
                    trackNumber=track['trackNumber'], bitRate=track['bitRate'])
                # if cpt % 100 == 1:
                #     w.commit()
                cpt += 1
            print("\n\n%i tracks indexed" % cpt)
            w.commit()
            ix.optimize()
            ix.close()
        else:
            print("already indexed")

    def query(self, query):
        if self.ix is None:
            self.ix = FileStorage(self.folder).open_index()
        q = self.parser.parse(query)
        return self.ix.searcher().search(q, sortedby=("album", "name"), limit=None)

def getdocs():
    params = dict(request.args.items())
    search_terms = params['NPS'].split(quails.DELIMITER)
    try:
        ix = index.open_dir("indexQ")
    except Exception:
        return jsonify(failure="Index not found. Ensure that the index exists and try again.")
    qp = MultifieldParser(["title", "body"], schema=ix.schema)
    queries = []
    for term in search_terms:
        queries.append(qp.parse(term))
    docs = OrderedDict()
    hit_list = []
    with ix.searcher() as searcher:
        for query in queries:
            results = searcher.search(query)
            for result in results:
                hit_list.append((str(query), result['title']))
    return jsonify(results=hit_list)

class FTSSearcher(object):
    """Full-text searcher."""

    def __init__(self, storage=default_storage):
        self._fragmenter_maxchars = 70
        self._fragmenter_surround = 70
        self._formatter = MarkFormatter()
        schema = Schema(news_id=ID(unique=True, stored=True),
                        title=TEXT(field_boost=2.0, analyzer=analyzer),
                        content=TEXT(analyzer=analyzer))
        self._ix = storage.open_index(schema=schema)
        self._parser = MultifieldParser(["title", "content"], self._ix.schema)
        self._searcher = self._ix.searcher()

    def search(self, query_string, limit=10):
        """Search documents."""
        # refresh searcher
        query_string = util.str2unicode(query_string)
        query = self._parser.parse(query_string)
        search_results = self._searcher.search(query, limit=limit)
        # set highlight attributes
        search_results.formatter = self._formatter
        search_results.fragmenter.maxchars = self._fragmenter_maxchars
        search_results.fragmenter.surround = self._fragmenter_surround
        return search_results

    def close(self):
        self._searcher.close()

def keywords(request):
    query = request.GET.get('q', '')
    if not query:
        return render(request, 'search/keywords.html', {'page_name': 'search.keywords'})
    qtext = get_tokenized_query(query)
    print(qtext)
    idx_dir = os.path.join(settings.BASE_DIR, 'search/lagou_idx')
    ix = open_dir(idx_dir)
    searcher = ix.searcher()
    parser = MultifieldParser(["name", "com_name", "city"], schema=ix.schema)
    q = parser.parse(qtext)
    plen = 100
    results = searcher.search(q, limit=plen)
    total = len(results)
    got = results.scored_length()
    numterms = 100
    if got < 10:
        numterms = 10
    elif got < 100:
        numterms = 50
    keywords = [(kw, score) for kw, score in results.key_terms("desc", docs=got, numterms=numterms)]
    return render(request, 'search/keywords.html', {
        'page_name': 'search.keywords',
        'query': query,
        'total': total,
        'got': got,
        'keywords': keywords,
    })

def search_whoosh_index(query, offset=0, limit=10, *args, **kwargs):
    ix = get_whoosh_index()
    parser = MultifieldParser(['content', 'authors', 'tags', 'title', 'abstract'], ix.schema)
    # user query
    q = parser.parse(query)
    if not query:
        q = Every()
        print('arch')
    # parse remaining args as filter terms
    allow_q = And([Term(key, value) for key, value in kwargs.items()])
    res = []
    count = 0
    offset = int(offset)
    limit = int(limit)
    right = offset + limit
    # restrict_q = Or([Term("path", u'%s' % d.id) for d in qs])
    # print('query', q, allow_q, kwargs)
    with ix.searcher() as searcher:
        # From WHOOSH documentation:
        # > Currently, searching for page 100 with pagelen of 10 takes the same amount of time
        # > as using Searcher.search() to find the first 1000 results
        results = searcher.search(q, filter=allow_q, limit=right, terms=True)
        count = len(results)
        for hit in list(results)[offset:]:
            res.append({
                # 'title': hit['title'],
                'short_url': hit['path'],
                'highlights': hit.highlights("content", top=5)
            })
            # @todo filter by empty highlight strings
    return {'results': res, 'count': count}

def search(q, limit=None):
    # q = str(q)
    ix = open_dir(DIRECTORY, NAME)
    with ix.searcher() as searcher:
        qp = MultifieldParser(fieldnames=['title',
                                          'author',
                                          'tags',
                                          'notes',
                                          'text',
                                          'source',
                                          # 'cached',
                                          'year'],
                              fieldboosts={'title': 7,
                                           'year': 6,
                                           'author': 10,
                                           'tags': 4,
                                           'notes': 2,
                                           'text': 1},
                              schema=ix.schema)
        # Whoosh chokes on queries with stop words, so remove them.
        q = remove_stopwords(q)
        q = qp.parse(q)
        for hit in searcher.search(q, limit=limit):
            yield hit

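# search() above relies on a remove_stopwords() helper. A minimal sketch; the
# helper name comes from the call above, while using Whoosh's built-in English
# stop word set here is an assumption.
from whoosh.analysis import STOP_WORDS

def remove_stopwords(q):
    # drop any term found in Whoosh's default stop word list
    return ' '.join(w for w in q.split() if w.lower() not in STOP_WORDS)
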
def search_commodity():
    from shop import app
    ix = open_dir(app.config.get("INDEX_DIR"))
    searcher = ix.searcher()
    mparser = MultifieldParser(["content", "title"], schema=ix.schema)
    query_raw = request.args.get('q', '')
    if query_raw:
        query = mparser.parse(query_raw.lower())
        results = searcher.search(query)
        result_id = []
        for result in results:
            result_id.append(int(result['id']))
        result_id = list(set(result_id))
        wq = None
        for rid in result_id:
            if not wq:
                wq = Q(id=rid)
            else:
                wq |= Q(id=rid)
        if wq:
            coms = Commodity.select().where(wq)
        else:
            coms = []
    else:
        coms = Commodity.select()
    category = int(request.args.get('c', '0'))
    if category and category != 1:
        coms = [c for c in coms if c.is_category(category)]
    return render_template('core/com_list.html', commodities=coms)

def build_keywords_query(keywords):
    """
    Build parsers for a query.

    :param MultiDict keywords: The search texts keyed by scope key. If empty,
        the query will match every document.
    """
    queries = []
    if keywords:
        composer = current_app.config['KERKO_COMPOSER']
        text_plugins = [PhrasePlugin(), GroupPlugin(), OperatorsPlugin()]
        for key, value in keywords.items(multi=True):
            fields = [
                spec.key for spec in composer.fields.values() if key in spec.scopes
            ]
            if not fields:
                raise KeyError  # No known field for that scope key.
            parser = MultifieldParser(fields, schema=composer.schema, plugins=text_plugins)
            queries.append(parser.parse(value))
    else:
        queries.append(Every())
    return And(queries)

def search(index_name, text, scope=None, limit=20):
    index_dir = get_index_path(index_name)
    ix = open_dir(index_dir)
    results = None
    out = []
    with ix.searcher() as searcher:
        parser = MultifieldParser(["title", "content"], ix.schema)
        parser.remove_plugin_class(FieldsPlugin)
        parser.remove_plugin_class(WildcardPlugin)
        query = parser.parse(text)
        filter_scoped = None
        if scope:
            filter_scoped = Prefix("path", scope)
        results = searcher.search(query, limit=limit, filter=filter_scoped)
        for r in results:
            title_highlights = r.highlights("title")
            content_highlights = r.highlights("content")
            out.append(
                frappe._dict(
                    title=r["title"],
                    path=r["path"],
                    title_highlights=title_highlights,
                    content_highlights=content_highlights,
                ))
    return out

def sample(entry):
    try:
        qp = MultifieldParser(["brewery", "beer"], schema=ix.schema)
        q = qp.parse(entry)
        # q = qp.parse(u'sculpin')
        with ix.searcher() as s:
            results = s.search(q)
            # limit search hits to only hits with beer, brewery, and id
            results = [results[r] for r in range(len(results)) if len(results[r]) >= 3]
            liner = []
            ids = []
            for x in range(len(results)):
                be = results[x]['beer']
                br = results[x]['brewery']
                bi = results[x]['beer_id']
                new_line = str(be + ', ' + br)
                ids.append(bi)
                liner.append(new_line)
            lines = []
            for d in range(len(results)):
                ent = {'label': liner[d], 'value': ids[d]}
                lines.append(ent)
            # return '{}'.format(lines)
            return lines
    except Exception as rep:
        return '{}'.format(rep)

def page(self, page, limit):
    with self.engine.index.searcher() as searcher:
        parser = MultifieldParser(
            self.engine.search_fields,
            schema=self.engine.index.schema,
        )
        parser.add_plugin(GtLtPlugin())
        parser.add_plugin(PhrasePlugin())
        parser.add_plugin(FieldsPlugin())
        # parser.remove_plugin_class(WildcardPlugin)
        # parser.add_plugin(WildcardPlugin())
        parser.add_plugin(PrefixPlugin())
        whoosh_query = parser.parse(self.query.toString(self.engine))
        # print("============" + str(whoosh_query))
        results = searcher.search_page(whoosh_query, page, limit, sortedby=self.order)
        self.rows = results.total
        _results = []
        doc_class = self.engine.database.document
        for result in results:
            doc = doc_class(data={field: result.get(field, None)
                                  for field in self.engine.stored_fields},
                            restore=True)
            _results.append(doc)
        return _results

def search(self, text, scope=None, limit=20):
    """Search from the current index

    Args:
        text (str): String to search for
        scope (str, optional): Scope to limit the search. Defaults to None.
        limit (int, optional): Limit number of search results. Defaults to 20.

    Returns:
        [List(_dict)]: Search results
    """
    ix = self.get_index()
    results = None
    out = []
    with ix.searcher() as searcher:
        parser = MultifieldParser(["title", "content"], ix.schema)
        parser.remove_plugin_class(FieldsPlugin)
        parser.remove_plugin_class(WildcardPlugin)
        query = parser.parse(text)
        filter_scoped = None
        if scope:
            filter_scoped = Prefix(self.id, scope)
        results = searcher.search(query, limit=limit, filter=filter_scoped)
        for r in results:
            out.append(self.parse_result(r))
    return out

def MRR(queries, ground, score, ix):
    MRR_table = {}
    fields = ix.schema.names()  # the schema fields
    fields.remove('id')  # the 'id' field is not relevant for the MRR evaluation
    RR_sum = 0
    for query in queries:
        RR = 0
        qp = MultifieldParser(fields, ix.schema)
        parsed_query = qp.parse(queries[query])
        searcher = ix.searcher(weighting=score)
        results = searcher.search(parsed_query, limit=None)
        for doc in results:
            if int(doc['id']) in ground[query]:
                # stop the loop at the first relevant result
                K = doc.rank + 1
                # and compute the reciprocal rank for this query
                RR = 1 / K
                break
        RR_sum += RR  # sum the RR over the queries
    MRR = RR_sum / len(ground)  # average over the available queries
    return MRR

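# Sanity check for the MRR() logic above, on hypothetical data: MRR is the mean
# over queries of 1 / rank-of-first-relevant-hit.
#   query "a": first relevant document at rank 1 -> RR = 1/1 = 1.0
#   query "b": first relevant document at rank 4 -> RR = 1/4 = 0.25
#   MRR = (1.0 + 0.25) / 2 = 0.625
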
def search(self, query_str, limit=30, html=True, description=True, comments=False,
           search_comments=True, highlight=True):
    index = self._get_index()
    searcher = index.searcher()
    fields = ["summary", "description"]
    if search_comments:
        fields.append("comments_str")
    qp = MultifieldParser(fields, schema=index.schema)
    query = qp.parse(query_str)
    results = searcher.search(query, limit=limit)
    results.formatter = AnsiColorFormatter()
    results.fragmenter = WholeFragmenter()
    self.report(results)
    for hit in results:
        ticket = Ticket.get_by_id(hit["key"])
        text = Issue(ticket.data).to_string(with_description=description,
                                            with_comments=comments)
        if not html:
            text = self._html_to_text.handle(text)
        text = text.strip()
        if highlight and description:
            highlighted = hit.highlights("description", text=text)
            if highlighted:
                text = highlighted
        self.report(text)
        if description or comments:
            self.report("-" * 80)

def search(self, q, tool_name_boost, tool_section_boost, tool_description_boost,
           tool_label_boost, tool_stub_boost, tool_help_boost, tool_search_limit):
    """
    Perform search on the in-memory index. Weight in the given boosts.
    """
    # Change field boosts for searcher
    searcher = self.index.searcher(weighting=BM25F(
        field_B={
            'name_B': float(tool_name_boost),
            'section_B': float(tool_section_boost),
            'description_B': float(tool_description_boost),
            'labels_B': float(tool_label_boost),
            'stub_B': float(tool_stub_boost),
            'help_B': float(tool_help_boost)
        }))
    # Set query to search name, description, section, help, and labels.
    parser = MultifieldParser(
        ['name', 'description', 'section', 'help', 'labels', 'stub'],
        schema=self.schema)
    # Hyphens are wildcards in Whoosh causing bad things
    if q.find('-') != -1:
        q = (' ').join([token.text for token in self.rex(to_unicode(q))])
    # Perform the search
    hits = searcher.search(parser.parse('*' + q + '*'), limit=float(tool_search_limit))
    return [hit['id'] for hit in hits]

def job_details(request, pk='10'):
    results = []
    ix = open_dir(settings.WHOOSH_INDEX)
    parser = MultifieldParser([
        "jobtitle", "company", "city", "state", "country", "source", "date",
        "JD", "url", "latitude", "longitude", "relative_time"
    ], ix.schema)
    try:
        query = parser.parse("job_id:" + pk)
        print(query)
    except Exception:
        # don't show the user weird errors only because we don't
        # understand the query.
        # parser.parse("") would return None
        query = None
    if query is not None:
        searcher = ix.searcher()
        results = searcher.search(query)
        print(len(results))
        for result in results:
            print(result)
    searchQuery = request.session["searchQuery"]
    return render(request, 'job_details.html', {
        'query': pk,
        'results': results,
        'searchQuery': searchQuery
    })

def search(self, query):
    """General search function for a query string."""
    hit_docs = []
    index_dir = "D:/bjstinfo_index"  # deprecated: should come from a variable or a config file
    if not os.path.exists(index_dir):
        print("Error: indexer doesn't exist!")
        sys.exit(1)
    ix = index.open_dir(index_dir)
    # For keyword queries we search multiple document fields (Title, Keywords,
    # Abstract) with the query-time field boosts:
    # {"Title": 1.2, "Keywords": 1.1, "Abstract": 1.0}
    query_fields = ['Title', 'Keywords', 'Abstract']
    field_boosts = {'Title': 1.2, 'Keywords': 1.1, 'Abstract': 1.0}
    qp = MultifieldParser(query_fields, schema=ix.schema, fieldboosts=field_boosts)
    q = qp.parse(query)
    with ix.searcher() as s:
        results = s.search(q, limit=50, terms=True)
        # my_cf = ContextFragmenter(maxchars=100, surround=30)  # custom fragmenter
        # results.fragmenter = my_cf
        # my_score = StandarDeviationScorer(my_cf)  # custom scorer
        # results.scorer = my_score
        # results.formatter = HtmlFormatter()
        for hit in results:
            # print(hit.fields())
            hit_docs.append(hit.fields())
            # why doesn't the highlight call work here?
            # print(hit.highlights('Abstract', top=20))
    return hit_docs

def indexquery(name, www):
    if name is None:
        return []
    # print("Name: %s" % name)
    ix = index.open_dir("/var/www/restnames/index")
    qp = MultifieldParser([
        "commonname", "database", "tags", "name", "name_part", "country",
        "project", "url"
    ], schema=ix.schema, termclass=FuzzyTerm)
    qp.add_plugin(qparser.FuzzyTermPlugin())
    q = qp.parse(name)
    # q = Every()
    tempvar = []
    with ix.searcher() as searcher:
        results = searcher.search(q, limit=None)
        for hit in results:
            tempvar.append({
                'name': hit["name"],
                'commonname': hit["commonname"],
                'url': hit["url"]
            })
    if not www:
        return tempvar
    else:
        response = Response(
            render_template("searchresults.html", resultlist=tempvar))
        response.headers['content-type'] = 'text/html'
        return response

def search(self, queryEntered, pageNum):
    id = list()
    Name = list()
    Genre = list()
    Yearofrelease = list()
    Description = list()
    Rating = list()
    ImdbUrl = list()
    Votes = list()
    with self.indexer.searcher() as search:
        # fields to be parsed are Name, Description, Genre, and Year of Release
        parser = MultifieldParser([
            'Name', 'Description', 'Genre', 'Yearofrelease', 'Rating',
            'ImdbUrl', 'Votes'
        ], schema=self.indexer.schema, termclass=Variations)
        query = parser.parse(queryEntered)
        results = search.search_page(query, pagenum=pageNum)
        i = 0
        for x in results:
            id.append(i)
            Name.append(x['Name'])
            Description.append(x['Description'])
            Genre.append(x['Genre'])
            Yearofrelease.append(x['Yearofrelease'])
            Rating.append(x['Rating'])
            ImdbUrl.append(x['ImdbUrl'])
            Votes.append(x['Votes'])
            i = i + 1
    return id, Name, Description, Genre, Yearofrelease, Rating, ImdbUrl, Votes

def live_search(self, query):
    """live search on ngram field"""
    with self.ix.searcher(weighting=scoring.BM25F(title_B=2)) as searcher:
        qp = MultifieldParser(self.live_search_field + self.search_field,
                              schema=self.ix.schema)
        q = qp.parse(query)
        results = searcher.search(q, limit=25).copy()
        res = {'estimated_length': results.estimated_length(),
               'scored_length': results.scored_length(),
               'runtime': results.runtime,
               'list': []}
        for i, r in enumerate(results):
            if 'id' in r and 'space' in r:
                url = url_for('document.view', space=r['space'], doc_id=r['id'])
            else:
                url = None
            res['list'].append({'id': r.get('id', ''),
                                'space': r.get('space', ''),
                                'title': r.get('title', ''),
                                'rank': r.rank,
                                'url': url,
                                'score': results.score(i)})
    return res

def search(self, term):
    qp = MultifieldParser(['name', 'tags'], schema=SCHEMA)
    q = qp.parse(term)
    searcher = self.ix.searcher()
    results = searcher.search(q, limit=None)
    return results

def build_keywords_query(keywords):
    """
    Build parsers for a query.

    :param MultiDict keywords: The search texts keyed by scope key. If empty,
        the query will match every document.
    """
    queries = []
    if keywords:
        composer = current_app.config['KERKO_COMPOSER']
        text_plugins = [
            plugins.PhrasePlugin(),
            plugins.GroupPlugin(),
            plugins.OperatorsPlugin(
                And=r"(?<=\s)" + re.escape(gettext("AND")) + r"(?=\s)",
                Or=r"(?<=\s)" + re.escape(gettext("OR")) + r"(?=\s)",
                Not=r"(^|(?<=(\s|[()])))" + re.escape(gettext("NOT")) + r"(?=\s)",
                AndNot=None,
                AndMaybe=None,
                Require=None
            ),
            plugins.BoostPlugin(),
        ]
        for key, value in keywords.items(multi=True):
            fields = [spec.key for spec in composer.fields.values() if key in spec.scopes]
            if not fields:
                raise KeyError  # No known field for that scope key.
            parser = MultifieldParser(
                fields, schema=composer.schema, plugins=text_plugins
            )
            queries.append(parser.parse(value))
    else:
        queries.append(Every())
    return And(queries)

def search(self, term):
    if not self.index:
        self.load_index()
    parser = MultifieldParser(("body", "title", "tags"), schema=self.schema)
    query = parser.parse(term)
    results = self.searcher.search(query, limit=100)
    # results = self.searcher.search(query, limit=100, sortedby="date", reverse=True)
    return results

def search(self, query, *args, **kwargs):
    parser = MultifieldParser(fieldnames=('content', 'title', 'headings', 'url'),
                              schema=self.ix.schema,
                              fieldboosts={'content': 1, 'title': 2, 'headings': 3, 'url': 1})
    qry = parser.parse(query)
    search = self.ix.searcher()
    # with self.ix.searcher() as searcher:
    return search.search_page(qry, *args, **kwargs)

def search(self, query_string, index, parser=None, **kwargs):
    index = base._resolve_index(index)
    if parser is None:
        parser = MultifieldParser(fieldnames=index.get_searchable_fieldnames(),
                                  schema=index.get_schema())
    query = parser.parse(query_string)
    return self._search(query, index, **kwargs)

def search(self, trans, search_term, page, page_size, boosts):
    """
    Perform the search on the given search_term

    :param search_term: unicode encoded string with the search term(s)
    :returns results: dictionary containing number of hits, hits themselves and matched terms for each
    """
    tool_index_dir = os.path.join(trans.app.config.whoosh_index_dir, 'tools')
    index_exists = whoosh.index.exists_in(tool_index_dir)
    if index_exists:
        index = whoosh.index.open_dir(tool_index_dir)
        try:
            # Some literature about BM25F:
            # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf
            # http://en.wikipedia.org/wiki/Okapi_BM25
            # __Basically__ the higher the number, the bigger the weight.
            tool_weighting = scoring.BM25F(
                field_B={'name_B': boosts.tool_name_boost,
                         'description_B': boosts.tool_description_boost,
                         'help_B': boosts.tool_help_boost,
                         'repo_owner_username_B': boosts.tool_repo_owner_username_boost})
            searcher = index.searcher(weighting=tool_weighting)
            parser = MultifieldParser(
                ['name', 'description', 'help', 'repo_owner_username'],
                schema=tool_schema)
            user_query = parser.parse('*' + search_term + '*')
            try:
                hits = searcher.search_page(user_query, page, pagelen=page_size, terms=True)
            except ValueError:
                raise ObjectNotFound('The requested page does not exist.')
            log.debug('searching tools for: #' + str(search_term))
            log.debug('total hits: ' + str(len(hits)))
            log.debug('scored hits: ' + str(hits.scored_length()))
            results = {}
            results['total_results'] = str(len(hits))
            results['page'] = str(page)
            results['page_size'] = str(page_size)
            results['hits'] = []
            for hit in hits:
                hit_dict = {}
                hit_dict['id'] = hit.get('id')
                hit_dict['repo_owner_username'] = hit.get('repo_owner_username')
                hit_dict['repo_name'] = hit.get('repo_name')
                hit_dict['name'] = hit.get('name')
                hit_dict['description'] = hit.get('description')
                results['hits'].append({'tool': hit_dict,
                                        'matched_terms': hit.matched_terms(),
                                        'score': hit.score})
            return results
        finally:
            searcher.close()
    else:
        raise exceptions.InternalServerError('The search index file is missing.')

def search(self, search_key):
    ix = self.getIndex()
    parser = MultifieldParser(["book", "chapter", "verse", "verse_text"], schema=ix.schema)
    query = parser.parse(search_key)
    searcher = ix.searcher()
    result = searcher.search(query, limit=1000)
    return self.formatSearchResult(result)

def search(ix, query_string, sortedby=None, limit=10):
    mp = MultifieldParser(["title", "summary"], schema=ix.schema)
    s = ix.searcher()
    keywords = split_keywords(query_string)
    user_q = mp.parse(' OR '.join(keywords))
    # TODO: add query filter
    results = s.search(user_q, sortedby=sortedby, limit=limit)
    return results

def search(self, query, return_attribute='id'):
    # Change field boosts for the searcher to weight title and description more than help.
    searcher = self.index.searcher(
        weighting=BM25F(field_B={'title_B': 3, 'description_B': 2, 'help_B': 1}))
    # Set query to search title, description, and help.
    parser = MultifieldParser(['title', 'description', 'help'], schema=schema)
    results = searcher.search(parser.parse(query))
    return [result[return_attribute] for result in results]

def init():
    # Setting up the schemas ...
    schema_email = Schema(
        path=TEXT(stored=True),
        sender_email=TEXT(stored=True),
        recipient_emails=TEXT,
        date=DATETIME,
        subject=TEXT(stored=True),
        body=TEXT,
    )
    schema_book = Schema(email=TEXT(stored=True), name=TEXT(stored=True))
    schemas = {"index_emails": schema_email, "index_book": schema_book}
    if not os.path.exists(index_path):
        os.mkdir(index_path)
    indexes = {}
    for ixname, schema in schemas.items():
        """
        This part could be improved: it only indexes when no index exists yet.
        It does not account for indexed files that have been modified or
        deleted, as explained here:
        @url http://pythonhosted.org/Whoosh/indexing.html#incremental-indexing
        """
        exists = index.exists_in(index_path, indexname=ixname)
        if not exists:
            ix = index.create_in(index_path, schema, indexname=ixname)
            # Indexing ...
            ix = index.open_dir(index_path, indexname=ixname)
            writer = ix.writer()
            if ixname == "index_emails":
                files = read_dir()
                index_emails(files, writer)
            elif ixname == "index_book":
                index_book(writer)
        else:
            ix = index.open_dir(index_path, indexname=ixname)
        indexes[ixname] = ix
    # Main routine
    while True:
        ix = indexes.get("index_emails")
        with ix.searcher() as searcher:
            input_user = str(input("Enter a word from the subject or body (e.g. contract): "))
            mparser = MultifieldParser(["subject", "body"], schema=ix.schema)
            myquery = mparser.parse(input_user)
            results = searcher.search(myquery)
            print("==================================================")
            for result in results:
                # read_file(result.get("path"))
                print("Sender: " + findNameBySender(indexes, result.get("sender_email")))
                print("Subject: " + result.get("subject"))
                print("==================================================")

def search_my_archive(query_str):
    my_index = open_dir(conf.PATH_INDEX_ARCHIVE)
    with my_index.searcher() as searcher:
        mparser = MultifieldParser(['content', 'retweet'], schema=my_index.schema)
        query = mparser.parse(query_str)
        results = searcher.search(query)
        result_list = [entry['feed_id'] for entry in results]
    with open(conf.PATH_ARCHIVE_JSON, 'r') as f:
        feeds = json.loads(f.read())
    return [feed for feed in feeds if str(feed['id']) in result_list]

def answer_query(query):
    with main_index.searcher() as searcher:
        parser = MultifieldParser(['title', 'summary'], main_index.schema,
                                  fieldboosts={'title': 5.0, 'summary': 0.2})
        parser.add_plugin(FuzzyTermPlugin())
        # the tilde enables fuzzy matching within 1 edit, and /1 requires the first letter to match
        query = parser.parse(str(query) + '~/1')
        results = searcher.search(query, limit=100)
        tags = [r['tag'] for r in results]
    return tags

def term_search(self, query):
    terms = []
    if query.get('term'):
        parser = MultifieldParser(self.term_fields, schema=self.index.schema)
        terms.append(parser.parse(str(query.pop('term')[0])))
    # iterate over a copy of the keys, since query.pop() mutates the mapping
    for key in list(query.keys()):
        terms.append(Or([Term(key, str(t)) for t in query.pop(key)]))
    with self.searcher() as searcher:
        for entry in searcher.search(And(terms), limit=None):
            yield entry.fields()

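# Nearly every snippet above assumes an already-built Whoosh index (an "ix"
# variable or an open_dir() call). A minimal, self-contained sketch of that
# setup so the search functions have something to run against; the directory
# name, field names, and documents here are illustrative assumptions.
import os

from whoosh import index
from whoosh.fields import ID, TEXT, Schema
from whoosh.qparser import MultifieldParser

schema = Schema(path=ID(stored=True, unique=True),
                title=TEXT(stored=True),
                content=TEXT(stored=True))

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
ix = index.create_in("indexdir", schema)

writer = ix.writer()
writer.add_document(path=u"/a", title=u"First doc", content=u"hello world")
writer.add_document(path=u"/b", title=u"Second doc", content=u"hello whoosh")
writer.commit()

# a query parsed over both fields now returns hits from either one
with ix.searcher() as searcher:
    q = MultifieldParser(["title", "content"], ix.schema).parse(u"hello")
    print([hit["title"] for hit in searcher.search(q)])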