def busquedaPorAsuntoCuerpo(aBuscar):
    # Search the "asunto" (subject) and "cuerpo" (body) fields for the given text.
    ix = open_dir("index")
    qp = MultifieldParser(["asunto", "cuerpo"], schema=ix.schema)
    q = qp.parse(str(aBuscar))
    s = ix.searcher()
    results = s.search(q)
    return results
class EventSearcher(object):

    def search(self, query):
        data = None
        try:
            q = self.parser.parse(query)
            result = self.searcher.search(q)
            data = [{"title": res["title"],
                     "description": res["description"],
                     "id": res["id"],
                     "date": res["date"]} for res in result]
        finally:
            # better to close on __del__?
            self.searcher.close()
        return data

    def __init__(self):
        index = open_dir("search/index")
        self.searcher = index.searcher()
        self.parser = MultifieldParser(["title", "description", "date"], index.schema)
        self.parser.add_plugin(DateParserPlugin())
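A minimal usage sketch for the EventSearcher above, assuming an index already exists at "search/index" with stored title, description, id and date fields (date as a DATETIME field); the query string and field values are hypothetical:

# Hypothetical usage; note that search() closes the shared searcher,
# so each EventSearcher instance is effectively single-use as written.
es = EventSearcher()
events = es.search("concert date:'december 2015'")
for event in events or []:
    print(event["title"], event["date"])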
def search(self, query):
    qp = MultifieldParser(['title', 'content'], schema=self.idx.schema)
    q = qp.parse(query)

    query_result = self.trainModel.predict(query)
    query_label = query_result['outputLabel']
    query_output_list = query_result['outputMulti']
    query_score = float()
    for output in query_output_list:
        if output['label'] == query_label:
            query_score = float(output['score'])
    # print("Query result : " + query_label + "\nQuery score : " + str(query_score))

    with self.idx.searcher() as searcher:
        search_results = searcher.search(q, limit=None)
        article_ids = self.find_matching_political_bias(
            search_results, query_label, query_score)
        for article_id in article_ids:
            # print(article_id)
            print(self.get_article_url(article_id))
            print(self.get_article_title(article_id))
            print(self.get_article_snippet(article_id))
            print(self.get_article_date(article_id))
def search(self, query_str):
    # Renamed the parameter from "str" to avoid shadowing the built-in.
    ix = open_dir(indexdir)
    with ix.searcher() as searcher:
        parser = MultifieldParser(['keywords', 'content', 'title'], ix.schema)
        query = parser.parse(query_str)
        results = searcher.search(query)
        return [list(result.values()) for result in results]
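For context, a hedged sketch of building an index the search() helper above could query; only the field names come from the parser, while the schema details, storage directory and sample document are assumptions:

import os
from whoosh.fields import Schema, TEXT
from whoosh.index import create_in

indexdir = "indexdir"  # assumed; the snippet reads indexdir from module scope
schema = Schema(title=TEXT(stored=True),
                keywords=TEXT(stored=True),
                content=TEXT(stored=True))
os.makedirs(indexdir, exist_ok=True)
ix = create_in(indexdir, schema)
with ix.writer() as writer:  # the writer commits on leaving the with-block
    writer.add_document(title=u"Hello Whoosh",
                        keywords=u"greeting demo",
                        content=u"A tiny document for exercising the searcher.")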
def buildQueryParser(self):
    # Set numerical scoring parameters
    CoordinateDescentScorer.baselineScoreWeight = self.baselineScoreWeight
    CoordinateDescentScorer.pageRankWeight = self.pageRankWeight
    CoordinateDescentScorer.pageRankScalingWeight = self.pageRankScalingWeight

    # Build parser
    keywordsQueryParser = MultifieldParser(
        ['content', 'title', 'description', 'keywords', 'headers',
         'yqlKeywords', 'expandedYqlKeywords'],
        self.indexSchema, fieldboosts=self.weights, group=OrGroup)
    keywordsQueryParser.add_plugin(PlusMinusPlugin)
    return keywordsQueryParser
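The buildQueryParser variants in this section all follow the same pattern; below is a self-contained sketch of that pattern with a hypothetical schema and illustrative boosts (OrGroup and PlusMinusPlugin are standard whoosh.qparser pieces; the field names and weights here are assumptions, not the project's real ones):

from whoosh.fields import Schema, ID, TEXT
from whoosh.qparser import MultifieldParser, OrGroup
from whoosh.qparser.plugins import PlusMinusPlugin

# Hypothetical schema standing in for self.indexSchema.
schema = Schema(url=ID(stored=True), title=TEXT(stored=True), content=TEXT)

# OR the terms across both fields, weighting title matches above content matches.
parser = MultifieldParser(['content', 'title'], schema=schema,
                          fieldboosts={'content': 1.0, 'title': 2.0},
                          group=OrGroup)
parser.add_plugin(PlusMinusPlugin())  # enables +required / -prohibited syntax
print(parser.parse("python +whoosh -lucene"))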
def apartado_a(palabras):
    ix = open_dir("Index")
    # Join the search words into one query string; OR semantics come from the group.
    query_str = " ".join(palabras)
    with ix.searcher() as searcher:
        parser = MultifieldParser(["titulo", "sinopsis"], ix.schema, group=qparser.OrGroup)
        query = parser.parse(query_str)
        print(query)
        results = searcher.search(query)
        for r in results:
            print(r)
def search():
    start_time = time.time()
    form = request.form
    qstr = request.args.get('q')
    page = int(request.args.get('p', "1"))
    parser = MultifieldParser(['title', 'content'], schema=ix.schema,
                              group=OrGroup)  # termclass=MyFuzzyTerm
    query = parser.parse(qstr)
    notes = []
    with ix.searcher() as searcher:
        corrected = searcher.correct_query(query, qstr)
        results = searcher.search(query)
        rel = results.estimated_length()
        if corrected.string.lower() != qstr.lower():
            crs = searcher.search(corrected.query)
            if crs.estimated_length() > rel:
                notes.append("Did you mean: " + corrected.string)
        results = searcher.search_page(query, page, terms=True)
        my_cf = ContextFragmenter(maxchars=20, surround=30, charlimit=256)
        results.order = SCORE
        results.fragmenter = my_cf
        # results.formatter = HtmlFormatter()
        rsp = [{"url": item["url"],
                "content": item.highlights("content"),
                "title": item["title"]} for item in results]
        # return json.dumps(rsp)
        # print(json.dumps(rsp))
        if rel == 0:
            notes.append("Sorry, no result for your query")
        else:
            elapsed_time = time.time() - start_time
            notes.append("%d results found in %.2f seconds" % (rel, elapsed_time))
        return render_template("result.html", result=rsp, query=qstr, notes=notes,
                               nextpage=page + 1, urlquery=urllib.quote_plus(qstr))
def search(self, query):
    ix = open_dir("news")
    if "s" in query.keys():
        s = query["s"][0]
    else:
        s = ""
    if "w" in query.keys():
        if query["w"] == "c":
            w = scoring.TF_IDF()
        elif query["w"] == "b":
            w = scoring.BM25F()
        else:
            w = TimeWeight()
    else:
        w = OurWeight()
    # NOTE: the selected weighting `w` is never passed to ix.searcher() below.
    ret = {"r": [], "s": {}}
    with ix.searcher() as searcher:
        q = MultifieldParser(["t", "d"], ix.schema, group=OrGroup).parse(unicode(s, "UTF-8"))
        results = searcher.search(q, limit=100)
        for r in results:
            ret["r"].append({
                "t": r["t"],
                "d": r["d"],
                "p": r["time"],
                "l": r["link"],
                "e": r["tags"],
                "r": r["tags"]
            })
        # Collect spelling suggestions for each query word from the "d" field.
        corrector = searcher.corrector("d")
        for m in s.split():
            for suggestion in corrector.suggest(m, limit=3):
                if m not in ret["s"]:
                    ret["s"][m] = []
                ret["s"][m].append(suggestion)
    print(ret["s"])
    f = StringIO()
    f.write(json.dumps(ret, indent=4, separators=(',', ': ')))
    length = f.tell()
    f.seek(0)
    self.send_response(200)
    encoding = sys.getfilesystemencoding()
    self.send_header("Content-type", "text/html; charset=%s" % encoding)
    self.send_header("Content-Length", str(length))
    self.end_headers()
    return f
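TimeWeight and OurWeight above are custom weighting classes from the original project and are not shown here; a hedged sketch of plugging any custom scorer into a searcher, using whoosh.scoring.FunctionWeighting with a toy scoring function (the function body is purely illustrative):

from whoosh import scoring

def _toy_score(searcher, fieldname, text, matcher):
    # Toy scorer: just the raw weight of the term in the current document.
    return matcher.weight()

toy_weighting = scoring.FunctionWeighting(_toy_score)
# e.g. with ix.searcher(weighting=toy_weighting) as searcher: ...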
def buildQueryParser(self):
    headersQueryParser = MultifieldParser(['content', 'headers'],
                                          schema=self.indexSchema, group=OrGroup)
    headersQueryParser.add_plugin(PlusMinusPlugin)
    return headersQueryParser
def buildQueryParser(self):
    expandedYahooKeywordsQueryParser = MultifieldParser(['content', 'expandedYqlKeywords'],
                                                        schema=self.indexSchema, group=OrGroup)
    expandedYahooKeywordsQueryParser.add_plugin(PlusMinusPlugin)
    return expandedYahooKeywordsQueryParser
def run_query(query, index):
    """
    Queries the index for data with the given text query
    @param query The text query to perform on the indexed data
    @return A list of HTML string snippets to return
    """

    # Create a searcher object for this index
    searcher = index.searcher()

    # Create a query parser that will parse multiple fields of the documents
    field_boosts = {'content': 1.0, 'title': 3.0}
    query_parser = MultifieldParser(['content', 'title'], schema=index_schema,
                                    fieldboosts=field_boosts, group=OrGroup)

    # Build a query object from the query string
    query_object = query_parser.parse(query)

    # Build a spell checker on this index and add the "content" and "title" fields to it
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Extract the 'terms' that were found in the query string. This data can be used for highlighting the results
    search_terms = [text for fieldname, text in query_object.all_terms()]

    # Remove terms that are too short (filter into a new list rather than mutating the list while iterating over it)
    search_terms = [term for term in search_terms if len(term) > 3]

    # Perform the query itself
    search_results = searcher.search(query_object)

    # Get an analyzer for analyzing the content of each page for highlighting
    analyzer = index_schema['content'].format.analyzer

    # Build the fragmenter object, which will automatically split up excerpts by 'context' in the content
    fragmenter = ContextFragmenter(frozenset(search_terms))

    # Build the formatter, which dictates how to highlight the excerpts. In this case, we use HTML highlighting
    formatter = HtmlFormatter()

    # Iterate through the search results, highlighting and counting the results
    result_count = 0
    results = []
    for search_result in search_results:
        # Collect this search result
        results.append({
            'content': highlight(search_result['content'], search_terms, analyzer, fragmenter, formatter),
            'url': search_result['url'],
            'title': search_result['title']
        })
        result_count += 1

    # Build a list of 'suggest' words using the spell checker
    suggestions = []
    for term in search_terms:
        suggestions.append(spell_checker.suggest(term))

    # Return the list of web pages along with the terms used in the search
    return results, search_terms, suggestions, result_count
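The SpellChecker class used above comes from older Whoosh releases; later versions expose suggestions through searcher.corrector(), as the HTTP handler snippet earlier in this section does. A hedged sketch, assuming an index object named index with a "content" field:

# Per-field spelling suggestions via the reader-backed corrector.
with index.searcher() as searcher:
    corrector = searcher.corrector("content")
    for term in ("pyton", "serch"):  # hypothetical misspelled query terms
        print(term, "->", corrector.suggest(term, limit=3))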
def page_detail(request, pk):
    page = Page.objects.get(pk=pk)
    menu = page.menu
    comments = Comment.objects.filter(page=page).filter(
        visible=True).select_related('user')
    if menu.get_children():
        menu = menu.get_children()[0]
    menus = {'level2': [], 'level3': []}
    if menu.level == 2:
        for m in menu.get_siblings(include_self=True):
            if m == menu:
                m.active = True
            menus['level2'].append(m)
        menus['level3'] = menu.get_children()
    elif menu.level == 3:
        for m in menu.parent.get_siblings(include_self=True):
            if m == menu.parent:
                m.active = True
            menus['level2'].append(m)
        for m in menu.get_siblings(include_self=True):
            if m == menu:
                m.active = True
            menus['level3'].append(m)

    similar_pages = None
    if page.keywords:
        # Open index dir
        ix = whoosh_index.open_dir(settings.PAGE_SEARCH_INDEX)
        # Make parser
        parser = MultifieldParser(['content', 'name'], schema=ix.schema)
        # Configure filter (renamed from "filter" to avoid shadowing the built-in)
        region_filter = Term('region', page.region.id)
        # Make query string
        qstr = page.keywords.replace('+', ' AND ').replace(' -', ' NOT ').replace(' | ', ' OR ')
        # Parse query string
        query = parser.parse(qstr)
        # And... search in index! (reuse the searcher from the with-block instead of opening a second one)
        with ix.searcher() as searcher:
            hits = searcher.search(query, filter=region_filter, limit=None)
            similar_pages = Page.objects.filter(
                pk__in=[h.get('id') for h in hits if h.get('id') != page.id]
            ).order_by('?')[:5]

    context = {
        'page': page,
        'comments': comments,
        'menus': menus,
        'menu': menu,
        'similar_pages': similar_pages
    }

    # Comments
    if request.method == 'POST' and request.POST.get('submit_comment'):
        comment_form = CommentForm(request.POST)
        if comment_form.is_valid() and request.user.is_authenticated():
            comment_form.add_comment()
            context['comment_success'] = True
            # Clear form
            initial = {'user': request.user.id, 'page': page.id}
            comment_form = CommentForm(initial=initial)
    else:
        initial = {'user': request.user.id, 'page': page.id}
        comment_form = CommentForm(initial=initial)

    try:
        context['star_rating'] = StarRating.objects.get(page=page)
    except StarRating.DoesNotExist:
        context['star_rating'] = StarRating.objects.create(page=page)
    context['comment_form'] = comment_form
    return render(request, 'page_detail.html', context)
def buildQueryParser(self):
    descriptionQueryParser = MultifieldParser(['content', 'description'],
                                              schema=self.indexSchema, group=OrGroup)
    descriptionQueryParser.add_plugin(PlusMinusPlugin)
    return descriptionQueryParser
def buildQueryParser(self):
    keywordsQueryParser = MultifieldParser(["content", "keywords"],
                                           schema=self.indexSchema, group=OrGroup)
    keywordsQueryParser.add_plugin(PlusMinusPlugin)
    return keywordsQueryParser
def buildQueryParser(self):
    titleQueryParser = MultifieldParser(['content', 'title'],
                                        schema=self.indexSchema, group=OrGroup)
    titleQueryParser.add_plugin(PlusMinusPlugin)
    return titleQueryParser
def __init__(self):
    index = open_dir("search/index")
    self.searcher = index.searcher()
    self.parser = MultifieldParser(["title", "description", "date"], index.schema)
    self.parser.add_plugin(DateParserPlugin())
def search_whoosh_files(filename_in):
    ngt1 = RegexTokenizer() | NgramFilter(4)
    l_aux_i = 0
    filename_aux = "dataset_match_" + filename_in
    ix1 = open_dir("index_" + filename_in)

    # aux max val for progress bar
    if filename_in == "jrc_person":
        max_val = 3000000
    else:
        max_val = 3000000

    widgets = [
        'Progress Searching ' + filename_in + ': ', Percentage(), ' ',
        Bar(marker='0', left='[', right=']'), ' ', ETA(), ' '
    ]
    pbar = ProgressBar(widgets=widgets, maxval=max_val)  # 454000
    pbar.start()

    with ix1.searcher() as searcher:
        parser = MultifieldParser(['title'], ix1.schema)
        parser.remove_plugin_class(qparser.WildcardPlugin)
        parser.remove_plugin_class(qparser.PlusMinusPlugin)
        with open("dataset_non_match_" + filename_in + ".csv_tmp", 'w', encoding="utf-8") as inW2:
            with open("dataset_match_" + filename_in + ".csv", encoding="utf8") as csvfile:
                for row in csvfile:
                    l_aux_i = l_aux_i + 1
                    if l_aux_i % 20000 == 0:
                        print("Index search" + str(l_aux_i))
                        pbar.update(l_aux_i)
                    l_row_idx = row.split('|')[0]
                    l_row_aux = row.split('|')[1]
                    search_list = [token.text for token in ngt1(l_row_aux)]
                    if len(search_list) > 0:
                        # Query the index with one random 4-gram of the row text.
                        l_row_str = random.sample(search_list, 1)
                        query = parser.parse(l_row_str[0])
                        results = searcher.search(query)
                        results_aux = []
                        for result in results:
                            if result['id'] != l_row_idx:
                                results_aux.append([result['id'], result['title']])
                        if len(results_aux) > 0:
                            shuffle(results_aux)
                            line_new = (l_row_idx + "|" + l_row_aux + "|" +
                                        results_aux[0][0] + "|" + results_aux[0][1])
                            inW2.write(line_new.strip() + '\n')
                            if len(results_aux) > 1:
                                if results_aux[1][0] != results_aux[0][0]:
                                    line_new = (l_row_idx + "|" + l_row_aux + "|" +
                                                results_aux[1][0] + "|" + results_aux[1][1])
                                    inW2.write(line_new.strip() + '\n')
                            if len(results_aux) > 2:
                                if results_aux[2][0] != results_aux[1][0]:
                                    line_new = (l_row_idx + "|" + l_row_aux + "|" +
                                                results_aux[2][0] + "|" + results_aux[2][1])
                                    inW2.write(line_new.strip() + '\n')
    pbar.finish()