def busquedaPorAsuntoCuerpo(aBuscar):
    """Search the "asunto" and "cuerpo" fields of the index for *aBuscar*.

    Returns the Whoosh Results object.  The searcher is intentionally left
    open: Whoosh results become unusable once their searcher is closed, and
    callers iterate the returned results.
    """
    indice = open_dir("index")
    parser = MultifieldParser(["asunto", "cuerpo"], schema=indice.schema)
    consulta = parser.parse(unicode(str(aBuscar)))
    buscador = indice.searcher()
    return buscador.search(consulta)
def search(self, query):
    """Run *query* against the index and print the URL, title, snippet and
    date of each article whose political bias matches the classifier's
    prediction for the query string.
    """
    parser = MultifieldParser(['title', 'content'], schema=self.idx.schema)
    parsed_query = parser.parse(query)

    # Classify the query text and pull the confidence score that belongs
    # to the predicted label.
    prediction = self.trainModel.predict(query)
    predicted_label = prediction['outputLabel']
    predicted_score = float()
    for candidate in prediction['outputMulti']:
        if candidate['label'] == predicted_label:
            predicted_score = float(candidate['score'])

    with self.idx.searcher() as searcher:
        hits = searcher.search(parsed_query, limit=None)
        matching_ids = self.find_matching_political_bias(
            hits, predicted_label, predicted_score)
        for article_id in matching_ids:
            print(self.get_article_url(article_id))
            print(self.get_article_title(article_id))
            print(self.get_article_snippet(article_id))
            print(self.get_article_date(article_id))
def search(self, str):
    """Parse *str* against the keywords/content/title fields and return a
    list containing each hit's stored field values.

    NOTE(review): the parameter name shadows the builtin ``str``; kept
    as-is so keyword callers are not broken.
    """
    index = open_dir(indexdir)
    with index.searcher() as searcher:
        parsed = MultifieldParser(
            ['keywords', 'content', 'title'], index.schema).parse(str)
        hits = searcher.search(parsed)
        return [list(hit.values()) for hit in hits]
def search(self, query):
    """Handle a search request against the "news" index.

    *query* is a parsed query-string dict whose values are lists (cf. the
    ``query["s"][0]`` access): "s" holds the search string, "w" selects the
    scoring model ("c" = TF-IDF, "b" = BM25F, anything else = TimeWeight;
    absent = OurWeight).  Writes HTTP headers on *self* and returns a
    StringIO holding a JSON payload ``{"r": results, "s": suggestions}``.
    """
    ix = open_dir("news")
    s = query["s"][0] if "s" in query else ""
    if "w" in query:
        # BUG FIX: query-string values are lists, so the old comparison
        # (query["w"] == "c") could never be true; index into the list.
        mode = query["w"][0]
        if mode == "c":
            w = scoring.TF_IDF()
        elif mode == "b":
            w = scoring.BM25F()
        else:
            w = TimeWeight()
    else:
        w = OurWeight()
    ret = {"r": [], "s": {}}
    # BUG FIX: the chosen weighting was computed but never used; pass it to
    # the searcher so it actually affects ranking.
    with ix.searcher(weighting=w) as searcher:
        parsed = MultifieldParser(
            ["t", "d"], ix.schema, group=OrGroup).parse(unicode(s, "UTF-8"))
        for r in searcher.search(parsed, limit=100):
            ret["r"].append({
                "t": r["t"],
                "d": r["d"],
                "p": r["time"],
                "l": r["link"],
                "e": r["tags"],
                "r": r["tags"]
            })
        # Spelling suggestions per query word, from the "d" field.
        corrector = searcher.corrector("d")
        for m in s.split():
            # BUG FIX: the inner loop used to rebind ``s`` (the query
            # string); use a distinct name for each suggestion.
            for suggestion in corrector.suggest(m, limit=3):
                ret["s"].setdefault(m, []).append(suggestion)
    print(ret["s"])
    f = StringIO()
    f.write(json.dumps(ret, indent=4, separators=(',', ': ')))
    length = f.tell()
    f.seek(0)
    self.send_response(200)
    encoding = sys.getfilesystemencoding()
    self.send_header("Content-type", "text/html; charset=%s" % encoding)
    self.send_header("Content-Length", str(length))
    self.end_headers()
    return f
def apartado_a(palabras):
    """Search the "titulo" and "sinopsis" fields for *palabras* with OR
    semantics, printing the parsed query and every hit.

    The previous version also built a list of empty ``Term("")`` objects
    from *palabras* (ignoring each word) and never used it; that dead,
    buggy code has been removed.
    """
    ix = open_dir("Index")
    with ix.searcher() as searcher:
        parser = MultifieldParser(["titulo", "sinopsis"], ix.schema,
                                  group=qparser.OrGroup)
        query = parser.parse(palabras)
        print(query)
        for hit in searcher.search(query):
            print(hit)
def search():
    """Flask view: run the user's query ("q") against title/content, offer a
    spelling correction when it would match more documents, and render one
    page ("p") of highlighted results.
    """
    start_time = time.time()
    # ROBUSTNESS: default to "" instead of None so a missing "q" parameter
    # no longer crashes parse()/qstr.lower().  (The unused ``form`` local
    # was removed.)
    qstr = request.args.get('q', "")
    page = int(request.args.get('p', "1"))
    parser = MultifieldParser(['title', 'content'], schema=ix.schema,
                              group=OrGroup)
    query = parser.parse(qstr)
    notes = []
    with ix.searcher() as searcher:
        corrected = searcher.correct_query(query, qstr)
        results = searcher.search(query)
        rel = results.estimated_length()
        # Only suggest the corrected query when it would find more results.
        if corrected.string.lower() != qstr.lower():
            crs = searcher.search(corrected.query)
            if crs.estimated_length() > rel:
                notes.append("Did you mean: " + corrected.string)
        results = searcher.search_page(query, page, terms=True)
        results.order = SCORE
        results.fragmenter = ContextFragmenter(maxchars=20, surround=30,
                                               charlimit=256)
        # Highlighting must happen while the searcher is still open.
        rsp = [{
            "url": item["url"],
            "content": item.highlights("content"),
            "title": item["title"]
        } for item in results]
    if rel == 0:
        notes.append("Sorry, no result for your query")
    else:
        elapsed_time = time.time() - start_time
        notes.append("%d results found in %.2f seconds" % (rel, elapsed_time))
    return render_template("result.html", result=rsp, query=qstr,
                           notes=notes, nextpage=page + 1,
                           urlquery=urllib.quote_plus(qstr))
def run_query(query, index):
    """
    Queries the index for data with the given text query

    @param query The text query to perform on the indexed data
    @return A tuple (results, search_terms, suggestions, result_count):
            *results* is a list of dicts with highlighted 'content' plus
            'url' and 'title'; *suggestions* is one spell-checker list per
            search term.
    """
    # Create a searcher object for this index
    searcher = index.searcher()

    # Parse both fields, boosting title matches over content matches.
    field_boosts = {'content': 1.0, 'title': 3.0}
    query_parser = MultifieldParser(['content', 'title'],
                                    schema=index_schema,
                                    fieldboosts=field_boosts,
                                    group=OrGroup)
    query_object = query_parser.parse(query)

    # Spell checker over both indexed fields, used for "did you mean" output.
    spell_checker = SpellChecker(index.storage)
    spell_checker.add_field(index, 'content')
    spell_checker.add_field(index, 'title')

    # Terms found in the query string (used for highlighting), keeping only
    # terms longer than 3 characters.
    # BUG FIX: the old code removed short terms from the list *while
    # iterating it*, which skips elements; build a filtered list instead.
    search_terms = [text for fieldname, text in query_object.all_terms()
                    if len(text) > 3]

    # Perform the query itself
    search_results = searcher.search(query_object)

    # Analyzer/fragmenter/formatter used to build highlighted HTML excerpts;
    # the fragmenter splits excerpts by 'context' in the content.
    analyzer = index_schema['content'].format.analyzer
    fragmenter = ContextFragmenter(frozenset(search_terms))
    formatter = HtmlFormatter()

    # Collect the highlighted search results.
    results = [{
        'content': highlight(search_result['content'], search_terms,
                             analyzer, fragmenter, formatter),
        'url': search_result['url'],
        'title': search_result['title']
    } for search_result in search_results]
    result_count = len(results)

    # Build a list of 'suggest' words using the spell checker
    suggestions = [spell_checker.suggest(term) for term in search_terms]

    # Return the list of web pages along with the terms used in the search
    return results, search_terms, suggestions, result_count
def search_whoosh_files(filename_in):
    """For each row of dataset_match_<filename_in>.csv ("id|title|..."),
    search the Whoosh index "index_<filename_in>" with one random 4-gram of
    the row's title and append up to three *other* matching records to
    dataset_non_match_<filename_in>.csv_tmp as pipe-separated lines.
    """
    # Tokenize titles into 4-character n-grams; one n-gram is sampled per
    # row as the search key.
    ngt1 = RegexTokenizer() | NgramFilter(4)
    l_aux_i = 0  # running row counter (drives progress output)
    filename_aux = "dataset_match_" + filename_in  # NOTE(review): unused
    ix1 = open_dir("index_" + filename_in)
    #aux max val for progress bar
    # NOTE(review): both branches assign the same value, so this if/else
    # is currently a no-op.
    if filename_in == "jrc_person":
        max_val = 3000000
    else:
        max_val = 3000000
    widgets = [
        'Progress Searching ' + filename_in + ': ',
        Percentage(), ' ',
        Bar(marker='0', left='[', right=']'), ' ',
        ETA(), ' '
    ]
    pbar = ProgressBar(widgets=widgets, maxval=max_val)  #454000
    pbar.start()
    with ix1.searcher() as searcher:
        parser = MultifieldParser(['title'], ix1.schema)
        # Strip wildcard and +/- syntax so raw n-grams parse as literal text.
        parser.remove_plugin_class(qparser.WildcardPlugin)
        parser.remove_plugin_class(qparser.PlusMinusPlugin)
        with open("dataset_non_match_" + filename_in + ".csv_tmp", 'w',
                  encoding="utf-8") as inW2:
            with open("dataset_match_" + filename_in + ".csv",
                      encoding="utf8") as csvfile:
                for row in csvfile:
                    l_aux_i = l_aux_i + 1
                    # Report progress every 20000 rows.
                    if l_aux_i % 20000 == 0:
                        print("Index search" + str(l_aux_i))
                        pbar.update(l_aux_i)
                    # Rows look like "<id>|<title>|..."
                    l_row_idx = row.split('|')[0]
                    l_row_aux = row.split('|')[1]
                    search_list = [token.text for token in ngt1(l_row_aux)]
                    if len(search_list) > 0:
                        # Query the index with one random n-gram of the title.
                        l_row_str = random.sample(search_list, 1)
                        query = parser.parse(l_row_str[0])
                        results = searcher.search(query)
                        results_aux = []
                        for result in results:
                            # Exclude the row's own record from candidates.
                            if result['id'] != l_row_idx:
                                results_aux.append(
                                    [result['id'], result['title']])
                        if len(results_aux) > 0:
                            # Shuffle so the written candidates are random.
                            shuffle(results_aux)
                            line_new = l_row_idx + "|" + l_row_aux + "|" + results_aux[
                                0][0] + "|" + results_aux[0][1]
                            inW2.write(line_new.strip() + '\n')
                            # Write up to two more candidates, skipping
                            # consecutive duplicates by id.
                            if len(results_aux) > 1:
                                if results_aux[1][0] != results_aux[0][0]:
                                    line_new = l_row_idx + "|" + l_row_aux + "|" + results_aux[
                                        1][0] + "|" + results_aux[1][1]
                                    inW2.write(line_new.strip() + '\n')
                            if len(results_aux) > 2:
                                if results_aux[2][0] != results_aux[1][0]:
                                    line_new = l_row_idx + "|" + l_row_aux + "|" + results_aux[
                                        2][0] + "|" + results_aux[2][1]
                                    inW2.write(line_new.strip() + '\n')
    pbar.finish()
def __init__(self):
    """Open the on-disk search index and prepare a date-aware parser over
    the title, description and date fields."""
    idx = open_dir("search/index")
    self.searcher = idx.searcher()
    parser = MultifieldParser(["title", "description", "date"], idx.schema)
    parser.add_plugin(DateParserPlugin())
    self.parser = parser