def scoring_results(self, input_query, number_of_results):
    ix = index.open_dir(self.path + '/' + str(self.analyzer))
    # Check the scoring parameter and set the scoring function accordingly
    if self.scoring == 'frequency':
        scoring_function = scoring.Frequency()
    elif self.scoring == 'tf_idf':
        scoring_function = scoring.TF_IDF()
    elif self.scoring == 'bm25f_1':
        scoring_function = scoring.BM25F(B=0.35, K1=0.7)
    elif self.scoring == 'bm25f_2':
        scoring_function = scoring.BM25F(B=0.75, K1=1.2)
    elif self.scoring == 'bm25f_3':
        scoring_function = scoring.BM25F(B=0.75, K1=2.3)
    elif self.scoring == 'bm25f_4':
        scoring_function = scoring.BM25F(B=0.9, K1=1.1)
    else:
        print('scoring method not found')
    qp = QueryParser("content", ix.schema)
    parsed_query = qp.parse(input_query)  # parse the query
    searcher = ix.searcher(weighting=scoring_function)
    # execute the search
    results = searcher.search(parsed_query, limit=number_of_results)
    rr = []
    rank = 0
    # loop over search results
    for hit in results:
        rank += 1
        rr.append([hit['indx'], rank])
    # close searcher
    searcher.close()
    # return list of (docID, rank) pairs
    return rr
def createScoreFunction(s):
    if s == 1:
        w = scoring.BM25F(B=0.75, K1=1.5)
        print("BM25")
    elif s == 2:
        w = scoring.PL2(c=10.0)
        print("PL2")
    else:
        w = scoring.BM25F(B=0.75, K1=1.5)
        print("BM25")
    return w
def test():
    queries = load_queries()
    ix = index.open_dir(index_dir)
    qp = qparser.QueryParser('content', ix.schema)
    q = qp.parse("id")
    with ix.searcher(weighting=scoring.TF_IDF()) as searcher_tfidf:
        scoring.TF_IDF().scorer(searcher_tfidf, 'body', 'algebra').score(q.matcher(searcher_tfidf))
    with ix.searcher(weighting=scoring.BM25F()) as searcher_bm25f:
        scoring.BM25F().scorer(searcher_bm25f, 'body', 'algebra').score(q.matcher(searcher_bm25f))
def search(request):
    indexNewsObject = IndexNews()
    ix = indexNewsObject.ix
    if request.method == 'POST':
        inputQuery = request.POST['inputQuerySearchPage']
        request.session['inputQuery'] = inputQuery
        if inputQuery == '':
            # Message: "Please enter the phrase you are looking for" (Persian)
            context = {
                'message': 'لطفا عبارت مورد نظر خود را وارد کنید'
            }
            return render(request, 'searchPage/searchPage.html', context=context)
        else:
            # queryParser = QueryParser(fieldname='content', schema=ix.schema, group=OrGroup)
            # queryParser = MultifieldParser(['title', 'content'], schema=ix.schema, group=OrGroup)
            queryParser = MultifieldParser(['title', 'content', 'summary'], schema=ix.schema)
            query = queryParser.parse(inputQuery)
            with ix.searcher(weighting=scoring.BM25F()) as searcher:
                results = searcher.search(query, terms=True, limit=None)
                # Customize the HTML tag used to highlight matched terms
                htmlFormat = highlight.HtmlFormatter('b')
                results.formatter = htmlFormat
                results.fragmenter.maxchars = 300
                results.fragmenter.surround = 150
                paginator = Paginator(results, 15)
                page = request.GET.get('page')
                resultWithPage = paginator.get_page(page)
                context = {
                    'results': resultWithPage,
                    'inputQuery': inputQuery
                }
                return render(request, 'searchPage/searchPage.html', context=context)
    else:
        inputQuery = request.session['inputQuery']
        # queryParser = QueryParser(fieldname='content', schema=ix.schema, group=OrGroup)
        queryParser = MultifieldParser(['title', 'content', 'summary'], schema=ix.schema)
        query = queryParser.parse(inputQuery)
        with ix.searcher(weighting=scoring.BM25F()) as searcher:
            results = searcher.search(query, terms=True, limit=None)
            # Customize the HTML tag used to highlight matched terms
            htmlFormat = highlight.HtmlFormatter('b')
            results.formatter = htmlFormat
            results.fragmenter.maxchars = 300
            results.fragmenter.surround = 150
            paginator = Paginator(results, 15)
            page = request.GET.get('page')
            resultWithPage = paginator.get_page(page)
            context = {
                'results': resultWithPage,
                'inputQuery': inputQuery
            }
            return render(request, 'searchPage/searchPage.html', context=context)
def query(query_phrase):
    if not os.path.exists(INDEX_DIR):
        os.mkdir(INDEX_DIR)
    filter_words = load_all_words()
    word_list = jieba.cut(query_phrase)
    query_phrase = " ".join([w for w in word_list
                             if w not in filter_words and len(w.strip()) > 0])
    query_phrase = query_phrase.replace(" ", " ")
    print(type(query_phrase), query_phrase)
    ix = open_dir(INDEX_DIR)
    with ix.searcher(weighting=scoring.BM25F(B=0.1)) as searcher:
        query = QueryParser("content", ix.schema).parse(query_phrase)
        results = searcher.search(query, limit=150)
        re_json = []
        for e in results:
            score = float(e.score) * float(e["radio"])
            # print(e.score, e["radio"])
            m = e.highlights("content").encode('utf8')
            re_json.append((score, e["path"], m))
            # print('*' * 20)
        print(len(re_json))
    rs = sorted(re_json, key=lambda x: x[0], reverse=True)
    res = query_output(rs)
    ix.close()
    return res
def search_over_index(keywords):
    keys = nltk.word_tokenize(keywords)
    manifestos_list = []
    party_mentions_keyword = defaultdict(int)
    top_mentions = defaultdict(int)
    ix = open_dir("indexdir")
    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        for k in keys:
            qp = QueryParser('text', ix.schema, group=OrGroup).parse(k)
            results = searcher.search(qp, limit=None)
            for r in results:
                # List with all the manifestos containing keywords
                if r['id'] not in manifestos_list:
                    manifestos_list.append(r['id'])
                party_mentions_keyword[(r['party'], k)] += 1
                top_mentions[k] += 1
    print("\nAll the manifestos containing the keywords:")
    for manifesto in manifestos_list:
        print(manifesto)
    print("\nHow many times each party mentions each keyword:")
    for key, value in party_mentions_keyword.items():
        print('{0:30}{1:35}{2:30}'.format(key[1], key[0], value))
    manifestos_keywords(manifestos_list)
    list_mentioned_keywords(top_mentions)
def query_whoosh(whoosh_dir, queries, weighting=scoring.BM25F(), num_results=50):
    res_sets = []
    # Weighting used for ranking documents
    ix = index.open_dir(whoosh_dir)
    # Examine the effect of scoring on queries for key terms (and the key terms themselves)
    # Highlight search terms in results by making them UPPER CASE
    formatter = UppercaseFormatter()
    # Run queries and print results
    for q in queries:  # e.g. "new york", "empire state building", "oculus", ...
        cur = []
        with ix.searcher(weighting=weighting) as searcher:
            query = QueryParser("body", ix.schema).parse(q)
            results = searcher.search(query, limit=num_results)
            results.formatter = formatter
            print_header("Query: {} returned {} results for {}".format(
                q, len(results), str(weighting)))
            # if print_results:
            for i, result in enumerate(results):
                cur.append(result['url'].replace('index.txt', ''))
                print_result(i, result)
            print()
        res_sets.append(set(cur))
    return res_sets
def ranking(topic_id, p, index, model="TF-IDF"):
    topic_id = int(topic_id) - 101  # Normalize topic identifier to start at 0
    if model == "TF-IDF":
        weighting = scoring.TF_IDF()
    elif model == "BM25":
        weighting = scoring.BM25F()
    else:
        raise ValueError(
            "Invalid scoring model: please use 'TF-IDF' or 'BM25'")
    with open(os.path.join(corpus_dir, "..", "topics.txt")) as f:
        topics = f.read().split("</top>")[:-1]
    norm_topics = remove_tags(topics)
    topic = norm_topics[topic_id]
    if stemming:
        analyzer = StemmingAnalyzer()
    else:
        analyzer = StandardAnalyzer()
    tokens = [token.text for token in analyzer(topic)]
    string_query = ' '.join(tokens)
    with index.searcher(weighting=weighting) as searcher:
        q = MultifieldParser(
            ("date", "headline", "dateline", "byline", "content"),
            index.schema, group=OrGroup).parse(string_query)
        results = searcher.search(q, limit=p)
        return [(r["doc_id"], round(r.score, 4)) for r in results]
def whooshOpen(query):
    ix = open_dir("../lab3/indexdir")
    results_dict = {}
    query = QueryParser('content', ix.schema).parse(query)
    with ix.searcher(weighting=scoring.TF_IDF()) as s_tf:
        tf_results = s_tf.search(query, limit=100)
        for r in tf_results:
            results_dict.setdefault(r.docnum, []).append(r.score)
    with ix.searcher(weighting=scoring.BM25F()) as s_bm:
        bm_results = s_bm.search(query, limit=100)
        for r in bm_results:
            results_dict.setdefault(r.docnum, []).append(r.score)
    ranks = pageRank.pageRank(pageRank.inverted_index, 0.85, 10)
    l = []
    for (id, vals) in results_dict.items():
        if len(vals) == 2:
            l.append((vals[0], vals[1], ranks[id]))
    expected = start()
    ys = []
    for (tf, bm, pr) in l:
        ys.append(bm * expected[0] + tf * expected[1] + pr * expected[2] + expected[3])
    print(ys)
def query_search(indexdir, queries, n=10, function='BM25F'):
    ix = index.open_dir(indexdir)
    search_fields = ['resname', 'categories', 'address', 'city', 'state']  # search fields
    og = qparser.OrGroup.factory(0.9)
    qp = MultifieldParser(search_fields, ix.schema, termclass=query.Variations, group=og)
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(queries)
    result_index = []
    if function == 'BM25F':
        with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    if function == 'TF_IDF':
        with ix.searcher(weighting=scoring.TF_IDF()) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    return result_index
def search(self, user_query, ranking_function=scoring.BM25F(), phraseSearch=False):
    qp = QueryParser("body", schema=self.ix.schema)
    # Once you have a QueryParser object, call parse() on it to turn a query
    # string into a query object.
    # Default query language: if the user doesn't explicitly specify AND or OR
    # clauses, the parser treats the words as if they were connected by AND,
    # meaning all the terms must be present for a document to match.
    # For phrase search, wrap the query in quotes: "<query>".
    qp.add_plugin(qparser.GtLtPlugin)
    # qp.remove_plugin_class(qparser.PhrasePlugin)
    qp.add_plugin(qparser.PhrasePlugin)
    if phraseSearch:
        user_query = '"' + user_query + '"'
    query = qp.parse(user_query)
    print("# user_query", user_query, ", Query: ", query)
    print(query)
    with self.ix.searcher(weighting=ranking_function) as searcher:
        matches = searcher.search(query, limit=None)
        print("Total Number of Results:", len(matches))
        print("Number of scored and sorted docs in this Results object:", matches.scored_length())
        results = [item.fields() for item in matches]
        resultsDF = pandas.DataFrame.from_dict(results)
        return (matches.docs(), resultsDF)
def finalresult():
    if request.method == 'POST':
        # search query
        query = request.form['QA']
        print(query)
        results = []
        ix = index.open_dir("qadata_Index")
        schema = ix.schema
        # Create a query parser that looks through the designated fields in the index
        og = qparser.OrGroup.factory(0.9)
        mp = qparser.MultifieldParser(['question', 'answer'], schema, group=og)
        # This is the user query
        q = mp.parse(request.form['QA'])
        # Actual searcher; prints the top 5 hits
        with ix.searcher(weighting=scoring.BM25F()) as s:
            results = s.search(q, limit=5)
            for i in range(5):
                print(results[i]['question'], str(results[i].score), results[i]['answer'])
            return render_template("result.html",
                                   searchquery=request.form['QA'],
                                   Q1=results[0]['question'], A1=results[0]['answer'],
                                   Q2=results[1]['question'], A2=results[1]['answer'],
                                   Q3=results[2]['question'], A3=results[2]['answer'],
                                   Q4=results[3]['question'], A4=results[3]['answer'],
                                   Q5=results[4]['question'], A5=results[4]['answer'])
def create_searcher(self):
    # Build the searcher object
    ixr = open_dir(BASE_DIR + "\search_engine\index", 'my_indexing')
    # with ixr.searcher(weighting=scoring.BM25F()) as searcher:
    self.searcher = ixr.searcher(weighting=scoring.BM25F())
    # Not closed here, which may leak memory; remember to shut down the server.
    # Build the parser (a multi-field query parser)
    self.parser = MultifieldParser(['title', 'text'], schema=self.schema)
def score_to_file():
    # Open index
    ix = index.open_dir(index_dir)
    # Use the reader to get statistics
    reader = ix.reader()
    queries = load_queries()
    outfile = open(output_file, "w")
    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        # with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        qp = qparser.QueryParser(field, schema=ix.schema)
        # qp = qparser.MultifieldParser(fields, schema=ix.schema)
        for query in queries:
            print("Processing query number", query['id'])
            # Retrieve documents using the vector space model
            q = qp.parse(query['text'])  # we concatenate query terms
            res = searcher.search(q)
            # res = get_score(searcher, qp, query['text'])
            for r in res:
                outfile.write(query['id'] + " Q0 " + r['id'] + " " + str(r.score) + "\n")
            # Output max 50 results
            # for docnum in sorted(res, key=res.get, reverse=True)[:50]:
            #     # Look up our docID
            #     stored = reader.stored_fields(docnum)
            #     # Write `docID Q0 queryID score` into output file
            #     outfile.write(query['id'] + " Q0 " + stored['id'] + " " + str(res[docnum]) + "\n")
    outfile.close()
    ix.close()
def ranking(topic_id, p, index, model="TF-IDF"):
    topic_id = int(topic_id) - 101  # Correct topic identifier to start at 0
    if model == "TF-IDF":
        weighting = scoring.TF_IDF()
    elif model == "TF":
        weighting = scoring.Frequency()
    elif model == "BM25":
        weighting = scoring.BM25F()
    else:
        raise ValueError(
            "Invalid scoring model: please use 'TF', 'TF-IDF' or 'BM25'")
    topic = process_topic(topic_id, topic_directory)[1]
    if stemming:
        analyzer = StemmingAnalyzer(stoplist=set(stopwords.words("english")))
    else:
        analyzer = StandardAnalyzer(stoplist=set(stopwords.words("english")))
    tokens = [token.text for token in analyzer(topic)]
    string_query = ' '.join(tokens)
    with index.searcher(weighting=weighting) as searcher:
        q = QueryParser("content", index.schema, group=OrGroup).parse(string_query)
        results = searcher.search(q, limit=p)
        return [(r["doc_id"], round(r.score, 4)) for r in results]
def conduct_search(query_str):
    '''
    Conducts a search over the indexed documents using a user-provided query.

    Args:
        query_str (string): The query used for the search

    Returns:
        results_list: A ranked list of up to 3 (title, content) tuples holding
            the highest-scoring question and answer data
    '''
    # number of search results returned to the user
    NUM_OF_RESULTS_SHOWN = 3
    # open the index directory
    ix = open_dir("indexdir")
    # run the search against the index
    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        query = MultifieldParser(["title", "content"], ix.schema, group=OrGroup).parse(query_str)
        results = searcher.search(query, limit=NUM_OF_RESULTS_SHOWN, terms=True)
        if NUM_OF_RESULTS_SHOWN < len(results):
            results_list = [(results[num]["title"], results[num]["content"])
                            for num in range(NUM_OF_RESULTS_SHOWN)]
        else:
            results_list = [(results[num]["title"], results[num]["content"])
                            for num in range(len(results))]
    return results_list
def Search(query, ix):
    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        parser = QueryParser("post", schema=ix.schema, group=OrGroup)
        seg_list = jieba.cut_for_search(query)
        querystring = " ".join(seg_list)
        print(querystring)
        query = parser.parse(querystring)
        results = searcher.search(query, limit=20)
        if len(results) == 0:
            # Fallback reply: "Huh? I don't quite understand. Could you try rephrasing?"
            answer = "啊?我不太明白,换一种问法看看?"
        else:
            candidates = []
            for result in results:
                candidate = {
                    'post': result['post'],
                    'reply': result['reply'].replace(" ", ""),
                    'score': result.score,
                    'postemood': result['postemood']
                }
                print(candidate)
                candidates.append(candidate)
            print(len(candidates))
            bestresult = Rank(candidates, querystring)
            answer = bestresult['reply']
        print(answer)
        return answer
def SearchForTest(querypair, ix):
    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        query = querypair[0]
        emotion = querypair[1]
        parser = QueryParser("post", schema=ix.schema, group=OrGroup)
        seg_list = jieba.cut_for_search(query)
        querystring = " ".join(seg_list)
        print(querystring)
        query = parser.parse(querystring)
        results = searcher.search(query, limit=20)
        if len(results) == 0:
            answer = ""
        else:
            candidates = []
            for result in results:
                candidate = {
                    'post': result['post'],
                    'reply': result['reply'],
                    'score': result.score,
                    'postemood': result['postemood']
                }
                print(candidate)
                candidates.append(candidate)
            print(len(candidates))
            bestresult = RankForTest(candidates, emotion)
            answer = bestresult['reply']
        print(answer)
        return answer
def search(self, trans, search_term, page, page_size, boosts):
    """
    Perform the search on the given search_term

    :param search_term: unicode encoded string with the search term(s)

    :returns results: dictionary containing number of hits, hits themselves and matched terms for each
    """
    tool_index_dir = os.path.join(trans.app.config.whoosh_index_dir, 'tools')
    index_exists = whoosh.index.exists_in(tool_index_dir)
    if index_exists:
        index = whoosh.index.open_dir(tool_index_dir)
        try:
            # Some literature about BM25F:
            # http://trec.nist.gov/pubs/trec13/papers/microsoft-cambridge.web.hard.pdf
            # http://en.wikipedia.org/wiki/Okapi_BM25
            # Basically, the higher the number, the bigger the weight.
            tool_weighting = scoring.BM25F(field_B={
                'name_B': boosts.tool_name_boost,
                'description_B': boosts.tool_description_boost,
                'help_B': boosts.tool_help_boost,
                'repo_owner_username_B': boosts.tool_repo_owner_username_boost})
            searcher = index.searcher(weighting=tool_weighting)
            parser = MultifieldParser([
                'name',
                'description',
                'help',
                'repo_owner_username'], schema=schema)
            user_query = parser.parse('*' + search_term + '*')
            try:
                hits = searcher.search_page(user_query, page, pagelen=page_size, terms=True)
            except ValueError:
                raise ObjectNotFound('The requested page does not exist.')
            log.debug('searching tools for: #' + str(search_term))
            log.debug('total hits: ' + str(len(hits)))
            log.debug('scored hits: ' + str(hits.scored_length()))
            results = {}
            results['total_results'] = str(len(hits))
            results['page'] = str(page)
            results['page_size'] = str(page_size)
            results['hits'] = []
            for hit in hits:
                hit_dict = {}
                hit_dict['id'] = hit.get('id')
                hit_dict['repo_owner_username'] = hit.get('repo_owner_username')
                hit_dict['repo_name'] = hit.get('repo_name')
                hit_dict['name'] = hit.get('name')
                hit_dict['description'] = hit.get('description')
                matched_terms = {k: unicodify(v) for k, v in hit.matched_terms()}
                results['hits'].append({'tool': hit_dict, 'matched_terms': matched_terms, 'score': hit.score})
            return results
        finally:
            searcher.close()
    else:
        raise exceptions.InternalServerError('The search index file is missing.')
def make_search_service(search_text):
    charmap = charset_table_to_dict(default_charset)
    custom_analyzers = StemmingAnalyzer()
    index_path = join(pathlib.Path(__file__).parent.parent.absolute(), 'indexdir')
    myindex = open_dir(index_path)
    qp = MultifieldParser(["title", "textdata"], schema=myindex.schema, group=AndGroup,
                          fieldboosts={'title': 3.0, 'textdata': 0.8})
    qstring = search_text
    q = qp.parse(qstring)
    results_list = []
    myWeighting = scoring.MultiWeighting(scoring.BM25F(textdata_B=0.5),
                                         textdata=scoring.Frequency(),
                                         title=scoring.BM25F(title_B=2.0))
    with myindex.searcher(weighting=myWeighting) as s:
        results = s.search(q, limit=30, terms=True)
        # "Did you mean" and "results for" suggestions
        corrected = s.correct_query(q, qstring)
        did_you_mean = ""
        result_for = ""
        if corrected.query != q:
            if len(results) < 1:
                results = s.search(qp.parse(corrected.string), limit=30, terms=True)
                result_for = corrected.string
            else:
                did_you_mean = corrected.string
        # query expansion
        keywords = [keyword for keyword, score in results.key_terms("textdata", docs=3, numterms=5)]
        if keywords:
            query_keyword = qp.parse(reduce(lambda a, b: a + ' ' + b, keywords))
            results_keyword = s.search(query_keyword, limit=30, terms=True)
            results.upgrade_and_extend(results_keyword)
        # sorting
        key_sort = lambda result: result.score
        results = sorted(results, key=key_sort, reverse=True)
        for ris in results:
            result = {}
            result['title'] = ris['title']
            result['url'] = ris['url']
            result['id'] = ris['ID']
            result['highlight'] = ris.highlights("textdata")
            results_list.append(result)
    # used to compute precision and recall
    id_results = [ris['id'] for ris in results_list[:10]]
    return {
        'search_text': search_text,
        'results': results_list,
        'did_you_mean': did_you_mean,
        'result_for': result_for,
        'results_ids': id_results
    }
def query_evaluator(file_directory, query_tsv_name, result_limit, *notitle):
    '''
    Evaluates all queries with the 24 different search engine configurations.

    :param file_directory: directory where the indexes are stored ("C:./Cranfield_DATASET/" and "C:./Time_DATASET/")
    :param query_tsv_name: queries .tsv file name (cran_Queries.tsv, time_Queries.tsv)
    :param result_limit: the number of top-k results retrieved
    :param notitle: if given, only the body is considered (this is used for the Time.csv dataset)
    :return: SEs, a dictionary whose keys are the search engines (e.g. SE_01, SE_02, ..., SE_24)
             and whose values are their query document results
    '''
    SEs = defaultdict(list)  # dictionary where all the SEs will be stored
    # path where the queries live
    # ["C:./Cranfield_DATASET/cran_Queries.tsv", "C:./Time_DATASET/time_Queries.tsv"]
    query_path = file_directory + query_tsv_name
    # analyzer names
    analyzer_names = ['StemmingAnalyzer', 'StandardAnalyzer', 'RegexAnalyzer', 'SimpleAnalyzer',
                      'FancyAnalyzer', 'NgramAnalyzer', 'KeywordAnalyzer', 'LanguageAnalyzer']
    counter = 1  # counter used to name the SEs
    for analyzer in analyzer_names:
        index_directory = file_directory + 'inverted_index_' + analyzer  # directory where the index is stored
        ix = index.open_dir(index_directory)  # open the index inside the chosen directory
        # list of chosen scoring functions
        scoring_functions = [scoring.TF_IDF(), scoring.Frequency(), scoring.BM25F(B=0.75, K1=1.2)]
        # for each index, three different scoring functions are used:
        for score in scoring_functions:
            scoring_function = score  # select the scoring function
            # The query parser tells the search engine which fields to search;
            # a multifield parser searches in more than one field.
            if notitle:
                # Time dataset: only the body is considered
                qp = QueryParser("body", ix.schema)
            else:
                # Cranfield dataset: both title and body are considered
                qp = MultifieldParser(["title", "body"], ix.schema)
            # Create the searcher for the index based on the chosen scoring function
            searcher = ix.searcher(weighting=scoring_function)
            with open(query_path) as tsvfile:
                # the queries in the .tsv file are parsed one by one
                querys = csv.reader(tsvfile, delimiter='\t')
                header = next(querys)  # skip the header
                if header is not None:
                    # iterate over all the rows in the queries file (cran_Queries.tsv, time_Queries.tsv)
                    for query in querys:
                        # Parse the query: so far it is just a Python string, and it has
                        # to be interpreted by the engine as a query.
                        parsed_query = qp.parse(query[1])
                        # Run the query; only the top `result_limit` hits are kept
                        results = searcher.search(parsed_query, limit=result_limit)
                        for hit in results:
                            # Keep the relevant fields: query number, doc ID, rank and score
                            output = [query[0], hit['id'], str(hit.rank + 1), str(hit.score)]
                            SEs['SE_' + str(counter)].append(output)  # add the results to the dictionary
            print('analyzer: ' + analyzer,
                  'scoring_function: ' + str(scoring_function).split('.')[2].split(' ')[0],
                  '(' + str(counter) + ')')
            counter += 1
    return SEs
def generate_score(self, query, measure, k=None):
    '''Generate scores for a given query according to a given measure'''
    if measure == 'bm25':
        score = self.rank(query, weighting=scoring.BM25F(), k=k)
    elif measure == 'cos':
        score = self.rank(query, weighting=scoring.TF_IDF(), k=k)
    elif measure == 'freq':
        score = self.rank(query, weighting=scoring.Frequency(), k=k)
    else:
        raise ValueError("Unknown measure: use 'bm25', 'cos' or 'freq'")
    return score
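# The rank() helper that generate_score delegates to is not shown above. A minimal
# sketch of what it might look like, assuming the object keeps a Whoosh index in
# self.ix and searches a field named 'content' (both names are assumptions, not
# taken from the original snippet):
from whoosh.qparser import QueryParser

def rank(self, query_text, weighting, k=None):
    # Hypothetical companion to generate_score: parse the query, search with the
    # supplied Whoosh weighting model, and return (stored fields, score) pairs.
    parser = QueryParser("content", self.ix.schema)  # 'content' is an assumed field name
    parsed = parser.parse(query_text)
    with self.ix.searcher(weighting=weighting) as searcher:
        hits = searcher.search(parsed, limit=k)
        return [(hit.fields(), hit.score) for hit in hits]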
def person_query_search(indexdir, queries, user_id, E, n=10, function='BM25F'):
    prediction = user_cf(E, user_id, 3)
    ix = index.open_dir(indexdir)
    search_fields = ['resname', 'categories', 'address', 'city', 'state']  # search fields
    og = qparser.OrGroup.factory(0.9)
    qp = MultifieldParser(search_fields, ix.schema, termclass=query.Variations, group=og)
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(queries)
    result_index = []
    if function == 'BM25F':
        # with ix.searcher(weighting=scoring.BM25F(B=0.75, resname_B=1.0, categories_B=0.8, K1=1.2)) as s:
        #     (adds extra weight for the resname and categories fields)
        with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=None, sortedby=[scores])
            m = len(results)
            if m != 0:
                relevance = np.zeros(m)
                expected = np.zeros(m)
                for i in range(m):
                    relevance[i] = -results[i].score
                # normalized score from 0 to 1
                relevance = (relevance - relevance.min()) / (relevance.max() - relevance.min())
                for i in range(m):
                    expected[i] = relevance[i] * prediction[int(results[i]['ID'])]
                indorder = np.argsort(expected)
                k = min(m, n)
                for i in range(k):
                    result_index.append(int(results[indorder[-1 - i]]['ID']))
    if function == 'TF_IDF':
        with ix.searcher(weighting=scoring.TF_IDF()) as s:
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=None, sortedby=[scores])
            m = len(results)
            if m != 0:
                relevance = np.zeros(m)
                expected = np.zeros(m)
                for i in range(m):
                    relevance[i] = -results[i].score
                # normalized score from 0 to 1
                relevance = (relevance - relevance.min()) / (relevance.max() - relevance.min())
                for i in range(m):
                    expected[i] = relevance[i] * prediction[int(results[i]['ID'])]
                indorder = np.argsort(expected)
                k = min(m, n)
                for i in range(k):
                    result_index.append(int(results[indorder[-1 - i]]['ID']))
    return result_index
def get_scoring(scoring_measure):
    foo = scoring.Frequency()
    if scoring_measure == "TF_IDF":
        foo = scoring.TF_IDF()
    elif scoring_measure == "BM_25":
        foo = scoring.BM25F()
    elif scoring_measure == "TF":
        foo = scoring.Frequency()
    return foo
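# A minimal usage sketch (not from the original) showing how a helper like
# get_scoring is typically wired into a Whoosh search call; the index directory,
# field name, and example query below are illustrative placeholders.
from whoosh import index
from whoosh.qparser import QueryParser

def run_query(index_dir, query_text, scoring_measure="BM_25", top_k=10):
    # Open an existing index and search it with the selected weighting model.
    ix = index.open_dir(index_dir)  # placeholder directory
    with ix.searcher(weighting=get_scoring(scoring_measure)) as searcher:
        parsed = QueryParser("content", ix.schema).parse(query_text)  # assumed field name
        return [(hit.docnum, hit.score) for hit in searcher.search(parsed, limit=top_k)]

# Example (illustrative): run_query("indexdir", "information retrieval", "TF_IDF")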
def search(query):
    my_index = index.open_dir("my_index")
    qp = QueryParser("content", schema=my_index.schema)
    result = {}
    q = qp.parse(query)
    with my_index.searcher(weighting=scoring.BM25F()) as s:
        results = s.search(q, terms=True, limit=20)
        for r in results:
            result[r.values()[0]] = round(r.score)
    return result
def create_scoring_function(s):
    """ Returns a scoring function object, depending on the value of s. """
    if s == 2:
        w = scoring.PL2(c=10.0)
    else:
        w = scoring.BM25F(B=0.75, K1=1.5)
    return w
def eval_get_ranked_set_baseline(self, basefile):
    # Step 1: Read the saved keyterms for a subset of articles
    # (created by analyze_baseline_queries)
    g = Graph()
    g.parse(self.generic_path("keyterms", "analyzed", ".n3"), format="n3")
    articles = {}
    for (s, p, o) in g:
        if not str(s) in articles:
            articles[str(s)] = []
        articles[str(s)].append(str(o))
    # Step 2: Open the large whoosh index containing the text of
    # all cases. Then, create a query for each article based on
    # the keyterms.
    connector = query.Or
    indexdir = os.path.sep.join([self.config.datadir, 'ecj', 'index'])
    storage = FileStorage(indexdir)
    idx = storage.open_index()
    searcher = idx.searcher(weighting=scoring.BM25F())
    res = {}
    # for article in sorted(articles.keys()):
    for article in self._articles(basefile):
        terms = articles[article]
        rankedset = []
        # parser = qparser.QueryParser("content", idx.schema)
        # q = parser.parse(connector.join(terms))
        q = query.And([
            # query.Term("articles", article),
            connector([query.Term("content", x) for x in terms])
        ])
        # print q
        # self.log.debug("Article %s: %s", article, " or ".join(terms))
        results = searcher.search(q, limit=None)
        resultidx = 0
        # self.log.info("Keyterms for result: %r" % results.key_terms("content", docs=10, numterms=10))
        for result in results:
            reslbl = "%s (%s)" % (result['basefile'], results.score(resultidx))
            rankedset.append([result['basefile'], reslbl])
            # self.log.debug(u"\t%s: %2.2d" % (result['title'], results.score(resultidx)))
            resultidx += 1
        self.log.info(
            "Created baseline ranked set for %s: Top result %s (of %s)" %
            (article.split("/")[-1], rankedset[0][0], len(rankedset)))
        # Return just a list of URIs, no scoring information. But the
        # full URI isn't available in the whoosh db, so we recreate it.
        res[article] = [
            "http://lagen.nu/ext/celex/%s" % x[0] for x in rankedset
        ]
    return res
def search(self, query):
    ix = open_dir("news")
    if "s" in query.keys():
        s = query["s"][0]
    else:
        s = ""
    if "w" in query.keys():
        if query["w"] == "c":
            w = scoring.TF_IDF()
        elif query["w"] == "b":
            w = scoring.BM25F()
        else:
            w = TimeWeight()
    else:
        w = OurWeight()
    ret = {"r": [], "s": {}}
    with ix.searcher(weighting=w) as searcher:
        parser = MultifieldParser(["t", "d"], ix.schema, group=OrGroup).parse(unicode(s, "UTF-8"))
        results = searcher.search(parser, limit=100)
        for r in results:
            ret["r"].append({
                "t": r["t"],
                "d": r["d"],
                "p": r["time"],
                "l": r["link"],
                "e": r["tags"],
                "r": r["tags"]
            })
        corrector = searcher.corrector("d")
        for m in s.split():
            sug = corrector.suggest(m, limit=3)
            for s in sug:
                if m not in ret["s"].keys():
                    ret["s"][m] = []
                ret["s"][m].append(s)
    print(ret["s"])
    f = StringIO()
    f.write(json.dumps(ret, indent=4, separators=(',', ': ')))
    length = f.tell()
    f.seek(0)
    self.send_response(200)
    encoding = sys.getfilesystemencoding()
    self.send_header("Content-type", "text/html; charset=%s" % encoding)
    self.send_header("Content-Length", str(length))
    self.end_headers()
    return f
def exec_comp():
    '''
    Method that calculates the MRR (Mean Reciprocal Rank) and saves a table
    with the MRR evaluation for every search engine configuration
    '''
    # text analyzers
    selected_analyzers = [
        StemmingAnalyzer(),
        SimpleAnalyzer(),
        StandardAnalyzer(),
        RegexAnalyzer(),
        FancyAnalyzer(),
        NgramAnalyzer(5),
        KeywordAnalyzer(),
        LanguageAnalyzer('en')
    ]
    # analyzer labels used for the graph and for the MRR table
    sel_ana = [
        'StemmingAnalyzer()',
        'SimpleAnalyzer()',
        'StandardAnalyzer()',
        'RegexAnalyzer()',
        'FancyAnalyzer()',
        'NgramAnalyzer(5)',
        'KeywordAnalyzer()',
        'LanguageAnalyzer()'
    ]
    i = 0  # counter
    mrrs = []  # list where the MRR values for each SE configuration will be stored
    # scoring functions
    scoring_functions = [
        scoring.TF_IDF(),
        scoring.Frequency(),
        scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)
    ]
    scor_func = [' TF_IDF', ' Frequency', ' BM25F']
    # ground truth
    gt1 = pd.read_csv(os.getcwd() + "/part_1/Cranfield_DATASET/cran_Ground_Truth.tsv", sep='\t')
    # combinations of every chosen analyzer with every chosen scoring function
    for x in range(len(selected_analyzers)):
        for y in range(len(scoring_functions)):
            print(sel_ana[x] + scor_func[y])
            i = i + 1
            # execute the queries for the chosen configuration combination
            sr_1 = exec_queries(selected_analyzers[x], scoring_functions[y])
            # save the results of the search engine
            sr_1.to_csv(os.getcwd() + "/part_1/" + str(i) + "__.csv", index=False)
            mrrs.append((sel_ana[x] + scor_func[y], mrr(gt1, sr_1)))  # calculate MRR
    mrrs_saving = pd.DataFrame(mrrs)
    mrrs_saving.to_csv(os.getcwd() + "/part_1/mrrs.csv", index=False)  # store the MRR table
def search(humanReadableId):
    query = request.args.get('q', '').strip()
    pagination = None
    suggestion = None
    if query:
        index_base_dir = config().get_path("ZIM", "wikipedia_index_dir")
        index_dir = os.path.join(index_base_dir, humanReadableId)
        page = int(request.args.get('page', 1))
        # Load the index so we can query it for which fields exist
        ix = whoosh_open_dir_32_or_64(index_dir)
        # Set a higher value for the title field so it is weighted more
        weighting = scoring.BM25F(title_B=1.0)

        # Sort pages with "Image:" in their title after regular articles
        def image_pages_last(searcher, docnum):
            fields = searcher.stored_fields(docnum)
            if fields['title'].find("Image:") == 0:
                return 1
            else:
                return 0

        # Support older whoosh indexes that do not have a reverse_links field
        if 'reverse_links' in ix.schema.names():
            sortedby = sorting.MultiFacet([
                sorting.FunctionFacet(image_pages_last),
                sorting.ScoreFacet(),
                sorting.FieldFacet("reverse_links", reverse=True),
            ])
        else:
            sortedby = sorting.MultiFacet([
                sorting.FunctionFacet(image_pages_last),
                sorting.ScoreFacet(),
            ])
        (pagination, suggestion) = paginated_search(ix, ["title", "content"], query, page,
                                                    weighting=weighting, sort_column=sortedby)
    else:
        flash(_('Please input keyword(s)'), 'error')
    return render_template('zim/search.html',
                           humanReadableId=humanReadableId,
                           pagination=pagination,
                           suggestion=suggestion,
                           keywords=query,
                           endpoint_desc=EndPointDescription(
                               'zim_views.search', {'humanReadableId': humanReadableId}))