def get_scoring(scoring_measure):
    """Return the weighting object selected by ``scoring_measure``.

    ``"TF_IDF"`` selects ``scoring.TF_IDF``, ``"BM_25"`` selects
    ``scoring.BM25F`` and ``"TF"`` selects ``scoring.Frequency``.  Any other
    value also yields ``scoring.Frequency``.
    """
    if scoring_measure == "TF_IDF":
        return scoring.TF_IDF()
    if scoring_measure == "BM_25":
        return scoring.BM25F()
    # "TF" and every unrecognised name share the plain frequency weighting.
    return scoring.Frequency()
def scoring_results(self, input_query, number_of_results):
    """Run ``input_query`` against the index and return the ranked hits.

    The weighting is chosen from ``self.scoring`` and the index is opened
    from ``self.path + '/' + str(self.analyzer)``.

    :param input_query: query string, parsed against the ``content`` field
    :param number_of_results: maximum number of hits to retrieve
    :return: list of ``[hit['indx'], rank]`` pairs, with ``rank`` starting at 1
    :raises ValueError: if ``self.scoring`` is not one of the known names
    """
    # (B, K1) presets for the BM25F variants.
    bm25f_presets = {
        'bm25f_1': {'B': 0.35, 'K1': 0.7},
        'bm25f_2': {'B': 0.75, 'K1': 1.2},
        'bm25f_3': {'B': 0.75, 'K1': 2.3},
        'bm25f_4': {'B': 0.9, 'K1': 1.1},
    }

    # Compare with ``==``: ``is`` tests object identity, which is only
    # accidentally true for equal strings.  The name is validated before any
    # index I/O so a bad configuration fails fast with a clear error instead
    # of an unbound-variable crash further down.
    if self.scoring == 'frequency':
        scoring_function = scoring.Frequency()
    elif self.scoring == 'tf_idf':
        scoring_function = scoring.TF_IDF()
    elif self.scoring in bm25f_presets:
        scoring_function = scoring.BM25F(**bm25f_presets[self.scoring])
    else:
        raise ValueError('scoring method not found: %r' % (self.scoring,))

    ix = index.open_dir(self.path + '/' + str(self.analyzer))
    parsed_query = QueryParser("content", ix.schema).parse(input_query)

    # The context manager closes the searcher even if the search raises.
    with ix.searcher(weighting=scoring_function) as searcher:
        results = searcher.search(parsed_query, limit=number_of_results)
        # Hits are read while the searcher is still open.
        return [[hit['indx'], rank]
                for rank, hit in enumerate(results, start=1)]
def ranking(topic_id, p, index, model="TF-IDF"):
    """Rank documents of ``index`` against a topic and return the top ``p``.

    :param topic_id: topic identifier; it is converted to ``int`` and shifted
        by 101 so that identifiers start at 0
    :param p: maximum number of results to return
    :param index: index object providing ``searcher()`` and ``schema``
    :param model: ``'TF'``, ``'TF-IDF'`` or ``'BM25'``
    :return: list of ``(doc_id, score)`` tuples, scores rounded to 4 decimals
    :raises ValueError: if ``model`` is not one of the supported names
    """
    topic_index = int(topic_id) - 101

    if model == "TF":
        weighting = scoring.Frequency()
    elif model == "TF-IDF":
        weighting = scoring.TF_IDF()
    elif model == "BM25":
        weighting = scoring.BM25F()
    else:
        raise ValueError(
            "Invalid scoring model: please use 'TF', 'TF-IDF' or 'BM25'")

    topic_text = process_topic(topic_index, topic_directory)[1]

    # The module-level ``stemming`` flag picks the analyzer class; both use
    # the same English stop-word list.
    analyzer_class = StemmingAnalyzer if stemming else StandardAnalyzer
    analyzer = analyzer_class(stoplist=set(stopwords.words("english")))

    # Re-join the analysed tokens into a plain query string.
    string_query = ' '.join(token.text for token in analyzer(topic_text))

    with index.searcher(weighting=weighting) as searcher:
        parser = QueryParser("content", index.schema, group=OrGroup)
        hits = searcher.search(parser.parse(string_query), limit=p)
        return [(hit["doc_id"], round(hit.score, 4)) for hit in hits]
def search(ix, term, algorithm, pageRank=False):
    """Search ``ix`` for ``term`` with the named scoring algorithm.

    :param ix: index object providing ``searcher()`` and ``schema``
    :param term: query string, parsed against the ``content`` field
    :param algorithm: ``'BM25F'``, ``'Frequency'`` or ``'TF_IDF'``
    :param pageRank: forwarded unchanged to ``makeResults``
    :return: the value of ``makeResults(results, pageRank, 25)``, or ``None``
        (after printing a message) when ``algorithm`` is not recognised
    """
    # Pick the weighting class first; the three branches previously repeated
    # the whole search and each carried an unreachable trailing ``return``.
    if algorithm == 'BM25F':
        weighting_class = None  # searcher is created without a weighting
    elif algorithm == 'Frequency':
        weighting_class = scoring.Frequency
    elif algorithm == 'TF_IDF':
        weighting_class = scoring.TF_IDF
    else:
        print('Incorrect Alogrithm')
        return None

    print(algorithm + " Search")
    if weighting_class is None:
        searcher_kwargs = {}
    else:
        searcher_kwargs = {'weighting': weighting_class()}

    with ix.searcher(**searcher_kwargs) as searcher:
        query = QueryParser('content', ix.schema).parse(term)
        results = searcher.search(query, limit=1000)
        # Build the return value while the searcher is still open.
        return makeResults(results, pageRank, 25)
def query_evaluator(file_directory,query_tsv_name,result_limit, *notitle):
    '''
    Run every query of a TSV file against each analyzer/scoring configuration.

    For each analyzer name the index stored in
    ``file_directory + 'inverted_index_' + analyzer`` is opened and searched
    with three scoring functions (TF_IDF, Frequency, BM25F with B=0.75 and
    K1=1.2), i.e. 8 x 3 = 24 configurations.

    :param file_directory: directory prefix where the indexes and the query file are stored
    :param query_tsv_name: name of the tab-separated query file; its first row is
        treated as a header, column 0 is the query id and column 1 the query text
    :param result_limit: the number of top results retrieved per query
    :param notitle: if any extra positional argument is given, only the ``body``
        field is searched; otherwise both ``title`` and ``body`` are searched
    :return: SEs, a ``defaultdict(list)`` keyed ``'SE_1'``, ``'SE_2'``, ... whose values
        are lists of ``[query id, doc id, rank (1-based, as str), score (as str)]``;
        a configuration that returns no hit at all gets no key
    '''
    query_path = file_directory + query_tsv_name

    # Read the queries once instead of re-opening the file for each of the
    # 24 configurations.  ``next(..., None)`` skips the header without raising
    # on an empty file; rows without a query-text column are ignored.
    with open(query_path, newline='') as tsvfile:
        rows = csv.reader(tsvfile, delimiter='\t')
        next(rows, None)
        queries = [(row[0], row[1]) for row in rows if len(row) > 1]

    SEs = defaultdict(list)
    analyzer_names = ['StemmingAnalyzer', 'StandardAnalyzer', 'RegexAnalyzer', 'SimpleAnalyzer',
                      'FancyAnalyzer', 'NgramAnalyzer', 'KeywordAnalyzer', 'LanguageAnalyzer']
    counter = 1  # used to name the search engines

    for analyzer in analyzer_names:
        ix = index.open_dir(file_directory + 'inverted_index_' + analyzer)

        # The parser depends only on the index schema, so build it (and parse
        # every query) once per index rather than once per scoring function.
        if notitle:
            qp = QueryParser("body", ix.schema)
        else:
            qp = MultifieldParser(["title", "body"], ix.schema)
        parsed_queries = [(query_id, qp.parse(text)) for query_id, text in queries]

        scoring_functions = [scoring.TF_IDF(), scoring.Frequency(), scoring.BM25F(B=0.75, K1=1.2)]
        for scoring_function in scoring_functions:
            se_name = 'SE_' + str(counter)

            # The context manager closes the searcher; previously one searcher
            # per configuration was left open.
            with ix.searcher(weighting=scoring_function) as searcher:
                for query_id, parsed_query in parsed_queries:
                    results = searcher.search(parsed_query, limit=result_limit)
                    for hit in results:
                        SEs[se_name].append(
                            [query_id, hit['id'], str(hit.rank + 1), str(hit.score)])

            print('analyzer: ' + analyzer,
                  'scoring_function: ' + type(scoring_function).__name__,
                  '(' + str(counter) + ')')
            counter += 1

    return SEs
def make_search_service(search_text):
    """Search the ``indexdir`` index for ``search_text`` and package the hits.

    The query is parsed over the ``title`` and ``textdata`` fields (all terms
    required), run with a per-field weighting, optionally replaced by its
    spell-corrected form when it returns nothing, and expanded with the key
    terms of the top documents.

    :param search_text: raw query string typed by the user
    :return: dict with keys ``search_text``, ``results`` (list of dicts with
        ``title``, ``url``, ``id``, ``highlight``), ``did_you_mean``,
        ``result_for`` and ``results_ids`` (ids of the first 10 results)
    """
    index_path = join(pathlib.Path(__file__).parent.parent.absolute(), 'indexdir')
    myindex = open_dir(index_path)
    qp = MultifieldParser(["title", "textdata"], schema=myindex.schema, group=AndGroup,
                          fieldboosts={'title': 3.0, 'textdata': 0.8})
    q = qp.parse(search_text)

    my_weighting = scoring.MultiWeighting(scoring.BM25F(textdata_B=0.5),
                                          textdata=scoring.Frequency(),
                                          title=scoring.BM25F(title_B=2.0))

    # NOTE(review): the ``str`` type object is the "no value" placeholder for
    # these two fields.  It is kept because callers outside this block may
    # compare against it, but ``None`` or ``''`` would be the conventional
    # choice - confirm with the callers before changing it.
    did_you_mean = str
    result_for = str

    results_list = []
    with myindex.searcher(weighting=my_weighting) as s:
        results = s.search(q, limit=30, terms=True)

        # "did you mean" / "results for": when the corrected query differs,
        # either re-run the search with it (nothing found) or only suggest it.
        corrected = s.correct_query(q, search_text)
        if corrected.query != q:
            if len(results) < 1:
                results = s.search(qp.parse(corrected.string), limit=30, terms=True)
                result_for = corrected.string
            else:
                did_you_mean = corrected.string

        # Query expansion with the key terms of the top 3 documents.  The
        # previous guard (``not keywords and keywords == " "``) could never be
        # true, so this step never ran; it now runs whenever key terms exist.
        keywords = [keyword for keyword, score
                    in results.key_terms("textdata", docs=3, numterms=5)]
        if keywords:
            query_keyword = qp.parse(' '.join(keywords))
            results_keyword = s.search(query_keyword, limit=30, terms=True)
            results.upgrade_and_extend(results_keyword)

        # Highest score first.  Hits are read while the searcher is open.
        ranked = sorted(results, key=lambda hit: hit.score, reverse=True)
        for ris in ranked:
            results_list.append({
                'title': ris['title'],
                'url': ris['url'],
                'id': ris['ID'],
                'highlight': ris.highlights("textdata"),
            })

    # Used to compute precision and recall.
    id_results = [ris['id'] for ris in results_list[:10]]

    return {
        'search_text': search_text,
        'results': results_list,
        'did_you_mean': did_you_mean,
        'result_for': result_for,
        'results_ids': id_results
    }
def _rank_titles(ix, term, weighting=None):
    """Search ``content`` for ``term``, print each hit title, return ``{title: rank}``."""
    # ``weighting=None`` means "create the searcher without a weighting".
    if weighting is None:
        searcher = ix.searcher()
    else:
        searcher = ix.searcher(weighting=weighting)

    ranks = {}
    with searcher:
        query = QueryParser('content', ix.schema).parse(term)
        results = searcher.search(query, limit=25)
        for position, hit in enumerate(results, 1):
            print(hit['title'])
            ranks[hit['title']] = position
    return ranks


def _rank_distance(first, second):
    """Sum the absolute rank differences over the titles present in both rankings."""
    return sum(abs(rank - second[page])
               for page, rank in first.items() if page in second)


def search(ix, term):
    """Compare the top-25 rankings of three scoring functions for ``term``.

    Prints the titles returned with the default weighting (labelled BM25F),
    with ``scoring.Frequency`` and with ``scoring.TF_IDF``, followed by the
    pairwise sums of absolute rank differences over the shared titles.

    :param ix: index object providing ``searcher()`` and ``schema``
    :param term: query string, parsed against the ``content`` field
    :return: ``None``; all output is printed
    """
    # ``print(x)`` with a single argument behaves the same on Python 2 and 3.
    print("BM25F")
    bm25f = _rank_titles(ix, term)

    print("\nFrequency")
    freq = _rank_titles(ix, term, scoring.Frequency())

    print("\nTF_IDF")
    tf_idf = _rank_titles(ix, term, scoring.TF_IDF())

    print('\nBM25F vs Frequency')
    print(_rank_distance(bm25f, freq))

    print('\nBM25F vs TF_IDF')
    print(_rank_distance(bm25f, tf_idf))

    print('\nTF_IDF vs Frequency')
    print(_rank_distance(tf_idf, freq))
def generate_score(self, query, measure, k=None):
    '''Generate scores for a given query according to a given measure.

    :param query: query forwarded to ``self.rank``
    :param measure: ``'bm25'`` (BM25F), ``'cos'`` (TF_IDF) or ``'freq'`` (Frequency)
    :param k: forwarded to ``self.rank`` as ``k``
    :return: the value returned by ``self.rank``
    :raises ValueError: if ``measure`` is not one of the supported names
    '''
    if measure == 'bm25':
        weighting = scoring.BM25F()
    elif measure == 'cos':
        weighting = scoring.TF_IDF()
    elif measure == 'freq':
        weighting = scoring.Frequency()
    else:
        # Previously an unknown measure fell through to ``return score`` with
        # ``score`` unbound; fail with an explicit, descriptive error instead.
        raise ValueError(
            "Unknown measure %r: expected 'bm25', 'cos' or 'freq'" % (measure,))
    return self.rank(query, weighting=weighting, k=k)
def exec_comp():
    '''
    Evaluate every analyzer/scoring-function combination by MRR (Mean Reciprocal Rank).

    For each combination the queries are executed, the results are written to
    "/part_1/<n>__.csv" under the current working directory, and the MRR
    against the Cranfield ground truth is recorded. The MRR table is finally
    saved to "/part_1/mrrs.csv".
    '''
    # (label, analyzer) pairs; the label is the text used in the MRR table
    analyzer_configs = [
        ('StemmingAnalyzer()', StemmingAnalyzer()),
        ('SimpleAnalyzer()', SimpleAnalyzer()),
        ('StandardAnalyzer()', StandardAnalyzer()),
        ('RegexAnalyzer()', RegexAnalyzer()),
        ('FancyAnalyzer()', FancyAnalyzer()),
        ('NgramAnalyzer(5)', NgramAnalyzer(5)),
        ('KeywordAnalyzer()', KeywordAnalyzer()),
        ('LanguageAnalyzer()', LanguageAnalyzer('en')),
    ]

    # (label, scoring function) pairs
    scoring_configs = [
        (' TF_IDF', scoring.TF_IDF()),
        (' Frequency', scoring.Frequency()),
        (' BM25F', scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)),
    ]

    # ground truth
    gt1 = pd.read_csv(os.getcwd() +
                      "/part_1/Cranfield_DATASET/cran_Ground_Truth.tsv",
                      sep='\t')

    mrrs = []  # (configuration name, MRR) for each search engine configuration
    i = 0  # running configuration number, used in the result file names
    for analyzer_label, analyzer in analyzer_configs:
        for scoring_label, scoring_function in scoring_configs:
            config_name = analyzer_label + scoring_label
            print(config_name)
            i += 1

            # execute queries for the chosen configuration and save its results
            sr_1 = exec_queries(analyzer, scoring_function)
            sr_1.to_csv(os.getcwd() + "/part_1/" + str(i) + "__.csv",
                        index=False)

            mrrs.append((config_name, mrr(gt1, sr_1)))

    # store the MRR table
    pd.DataFrame(mrrs).to_csv(os.getcwd() + "/part_1/mrrs.csv", index=False)
def init_env(self):
    """Create the searchers, query parsers and tokenizer used by this object.

    Sets ``freq_searcher``, ``tfidf_searcher`` and ``bm25_searcher`` (one
    searcher per weighting on ``self.idx``), ``query_parser`` (``abstract``
    field) and ``title_parser`` (``title`` field), both with the fuzzy-term
    plugin, and ``tokenizer``.
    """
    from whoosh import qparser, query, scoring
    from whoosh.analysis import RegexTokenizer
    from whoosh.lang.morph_en import variations

    def _fuzzy_parser(fieldname):
        # Parser for one field that also understands fuzzy-term syntax.
        parser = QueryParser(fieldname, self.idx.schema)
        parser.add_plugin(FuzzyTermPlugin())
        return parser

    self.freq_searcher = self.idx.searcher(weighting=scoring.Frequency())
    self.tfidf_searcher = self.idx.searcher(weighting=scoring.TF_IDF())
    bm25_weighting = scoring.BM25F(B=0.74, K1=1.52)
    self.bm25_searcher = self.idx.searcher(weighting=bm25_weighting)

    self.query_parser = _fuzzy_parser('abstract')
    self.title_parser = _fuzzy_parser('title')

    self.tokenizer = RegexTokenizer()
def search_result(input_query, query, searchtype, directory):
    """Dispatch a search to ``search`` with the weighting named by ``searchtype``.

    ``'BM25'`` uses ``scoring.BM25F``, ``'TFIDF'`` uses ``scoring.TF_IDF`` and
    ``'TF'`` uses ``scoring.Frequency``.  The index directory is ``directory``
    joined onto ``settings.MEDIA_ROOT``.  Any other ``searchtype`` returns
    ``None`` without searching.
    """
    if searchtype == 'BM25':
        weighting_class = scoring.BM25F
    elif searchtype == 'TFIDF':
        weighting_class = scoring.TF_IDF
    elif searchtype == 'TF':
        weighting_class = scoring.Frequency
    else:
        return None

    index_dir = os.path.join(settings.MEDIA_ROOT, directory)
    return search(input_query, query, index_dir, weighting_class())
def test_dismax():
    """DisjunctionMax over three fields scores the best field under Frequency.

    One document holds "alfa" once in ``f1``, three times in ``f2`` and once
    in ``f3``; the expected score of the disjunction-max query is 3.0.
    """
    fieldnames = ("f1", "f2", "f3")

    schema = fields.Schema(id=fields.STORED, f1=fields.TEXT,
                           f2=fields.TEXT, f3=fields.TEXT)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    writer.add_document(id=1,
                        f1=u("alfa bravo charlie delta"),
                        f2=u("alfa alfa alfa"),
                        f3=u("alfa echo foxtrot hotel india"))
    writer.commit()

    with ix.searcher(weighting=scoring.Frequency()) as s:
        # The document is reachable through each field individually.
        for fieldname in fieldnames:
            found = list(s.documents(**{fieldname: "alfa"}))
            assert_equal(found, [{"id": 1}])

        dm = DisjunctionMax([Term(fieldname, "alfa")
                             for fieldname in fieldnames])
        r = s.search(dm)
        assert_equal(r.score(0), 3.0)
def test_dismax(self):
    """DisjunctionMax scoring under Frequency weighting, with and without tiebreak.

    One document holds "alfa" once in ``f1``, three times in ``f2`` and once
    in ``f3``.
    """
    schema = fields.Schema(id=fields.STORED,
                           f1=fields.TEXT, f2=fields.TEXT, f3=fields.TEXT)
    st = RamStorage()
    ix = st.create_index(schema)
    w = ix.writer()
    w.add_document(id=1, f1=u"alfa bravo charlie delta",
                   f2=u"alfa alfa alfa",
                   f3=u"alfa echo foxtrot hotel india")
    w.commit()

    qs = [Term("f1", "alfa"), Term("f2", "alfa"), Term("f3", "alfa")]

    s = ix.searcher(weighting=scoring.Frequency())
    # Close the searcher even when an assertion fails; it was previously
    # left open.
    try:
        r = s.search(DisjunctionMax(qs))
        self.assertEqual(r.score(0), 3.0)

        r = s.search(DisjunctionMax(qs, tiebreak=0.5))
        self.assertEqual(r.score(0), 3.0 + 0.5 + 1.5 + 0.5)
    finally:
        s.close()
def test_all():
    """Smoke-test every weighting class: instantiate it and search with it.

    Builds an in-memory index of 100 random documents, then for each class
    from ``_weighting_classes`` runs a term search for every domain word.
    Instantiation and search failures are re-raised with the class named in
    the message.
    """
    domain = [u(word) for word in
              ("alfa", "bravo", "charlie", "delta", "echo", "foxtrot")]

    schema = fields.Schema(text=fields.TEXT)
    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    for _ in xrange(100):
        length = randint(10, 20)
        words = (choice(domain) for _ in xrange(length))
        writer.add_document(text=u(" ").join(words))
    writer.commit()

    # List ABCs that should not be tested
    abcs = ()
    # Initializer arguments for the weighting classes that require them;
    # every other class is built with no arguments.
    init_args = {
        "MultiWeighting": ([scoring.BM25F()], {"text": scoring.Frequency()}),
        "ReverseWeighting": ([scoring.BM25F()], {}),
    }
    no_args = ((), {})

    for wclass in _weighting_classes(abcs):
        try:
            args, kwargs = init_args.get(wclass.__name__, no_args)
            weighting = wclass(*args, **kwargs)
        except TypeError:
            e = sys.exc_info()[1]
            raise TypeError("Error instantiating %r: %s" % (wclass, e))

        with ix.searcher(weighting=weighting) as s:
            try:
                for word in domain:
                    s.search(query.Term("text", word))
            except Exception:
                e = sys.exc_info()[1]
                e.msg = "Error searching with %r: %s" % (wclass, e)
                raise
def test_all(self):
    """Smoke-test every weighting class by searching a random index with it.

    Builds an in-memory index of 100 random documents, then for each class
    from ``self._weighting_classes()`` runs a term search for every domain
    word.  The test passes when no search raises.
    """
    domain = [u"alfa", u"bravo", u"charlie", u"delta", u"echo", u"foxtrot"]
    schema = Schema(text=TEXT)
    storage = RamStorage()
    ix = storage.create_index(schema)
    w = ix.writer()
    for _ in xrange(100):
        w.add_document(text=u" ".join(choice(domain)
                                      for _ in xrange(randint(10, 20))))
    w.commit()

    # provide initializer arguments for any weighting classes that require them
    init_args = {"MultiWeighting": ([scoring.BM25F()], {"text": scoring.Frequency()})}

    reader = ix.reader()
    # The reader was previously never closed; release it even if a search
    # raises.
    try:
        for wclass in self._weighting_classes():
            if wclass.__name__ in init_args:
                args, kwargs = init_args[wclass.__name__]
                weighting = wclass(*args, **kwargs)
            else:
                weighting = wclass()
            searcher = Searcher(reader, weighting)

            for word in domain:
                # Only the absence of an exception is checked.
                searcher.search(query.Term("text", word))
    finally:
        reader.close()
from suggestCorrections import suggestCorrections
from printResult import printResult

if __name__ == "__main__":
    # Every lookup below runs against the same index directory.
    index_dir = './Indexes/stopWordsWithoutStemming'

    query = raw_input("Please Enter the query to search for:")
    input_query = query_AND(query, index_dir)
    suggestCorrections(input_query, query, index_dir)

    # ``print(x)`` with a single string prints exactly what ``print x`` does.
    print("BM25 Results")
    bm25_result = search(input_query, query, index_dir, scoring.BM25F())
    printResult(bm25_result, query)

    print("Phrasal Query Results")
    phrasal_query = query_phrasal(query, index_dir)
    phrasal_result = search(phrasal_query, query, index_dir, scoring.BM25F())
    printResult(phrasal_result, query)

    print("TF_IDF Results")
    tf_idf_result = search(input_query, query, index_dir, scoring.TF_IDF())
    printResult(tf_idf_result, query)

    print("TF Results")
    tf_result = search(input_query, query, index_dir, scoring.Frequency())
    printResult(tf_result, query)
def main():
    """Command-line entry point: run the queries and print the ranked hits.

    The weighting is selected from the parsed arguments: ``rank_func`` 1 is
    BM25F with per-field ``B`` values from ``get_B(weight_B)``, 2 is TF_IDF,
    3 is Frequency, anything else is plain BM25F.  Supplying ``weight_B``
    forces ``rank_func`` to 1.  Queries come from ``args.query`` (joined with
    spaces) or, when that is ``None``, from ``read_query()``.
    """
    args = parse_args()
    number = args.number
    weight_B = args.weight_B
    B, K1 = args.B, args.K1

    if args.query is None:
        query_list = read_query()
    else:
        query_list = [' '.join(args.query)]

    index_loc = 'index' if args.index_loc is None else args.index_loc

    # Per-field B weights only make sense for the fielded BM25F variant.
    rank_func = 1 if weight_B is not None else args.rank_func

    if rank_func == 1:
        B1, B2, B3, B4, B5 = get_B(weight_B)
        rank_name = 'bm25f'
        weighting = scoring.BM25F(B=B, K1=K1,
                                  title_B=B1, body_B=B2, category_B=B3,
                                  date_B=B4, rating_B=B5)
    elif rank_func == 2:
        rank_name = 'tf-idf'
        weighting = scoring.TF_IDF()
    elif rank_func == 3:
        rank_name = 'frequency'
        weighting = scoring.Frequency()
    else:
        rank_name = 'bm25'
        weighting = scoring.BM25F(B=B, K1=K1)

    ix = open_dir(index_loc)
    with ix.searcher(weighting=weighting) as searcher:
        parser = MultifieldParser(
            ['title', 'body', 'category', 'date', 'rating'],
            schema=ix.schema)

        for this_query in query_list:
            parsed = parser.parse(this_query)
            print('\n')
            print('--', this_query)

            results = searcher.search(parsed, limit=number)
            if len(results) == 0:
                print(' ')
                print('no matched result. please try again.')
                continue

            for hit in results:
                print(' ')
                print('#', hit.rank, rank_name, 'score:', round(hit.score, 5))
                print('title:', hit['title'])
                print('date:', hit['date'], 'rating:', hit['rating'],
                      'category:', hit['category'])
                print('body:', hit['body'])
indexDir = basePath + "index/" docDir = basePath + "docs/" benchmarkPath = basePath + "benchmark-data.json" if arg["search"]: query = arg['<query>'] spellCheck = not arg["--no-spell-check"] batch = arg["--batch"] limit = parseLimit(arg["--limit"]) wildcard = not arg["--no-wildcard"] if arg['--tf-idf']: modelScoring = scoring.TF_IDF() elif arg['--freq']: modelScoring = scoring.Frequency() else: modelScoring = scoring.BM25F() if spellCheck: fixed = correct(query) if fixed != query: if batch or confirm("Did you mean: `" + fixed + "`?"): query = fixed print("Searching for `" + query + "`") index = openIndex(indexDir) r = search(query, index, modelScoring, limit, wildcard) for result in r: with open(result["path"], "r") as f:
def generate_scores(self, query, k=None):
    '''Rank ``query`` with three weightings and return the three results.

    The weightings are, in order, BM25F, TF_IDF (the "cos" score) and
    Frequency; ``query`` and ``k`` are forwarded to ``self.rank`` each time.

    :return: tuple ``(bm25, cos, freq)``
    '''
    weighting_classes = (scoring.BM25F, scoring.TF_IDF, scoring.Frequency)
    # One ``self.rank`` call per weighting, in the order listed above.
    return tuple(
        self.rank(query, weighting=weighting_class(), k=k)
        for weighting_class in weighting_classes
    )
analyzer_names[i],schema) sw1_utils_schema.Fill_Empty_Schema(datasets[idx],directory_containing_the_index,\ datasets_len[idx]) dir_idx_list.append(directory_containing_the_index) ### ### Open the Index ### # for each of the datasets open index. And for each of the scoring functions # create a searcher, process queries and save retrieved results into Q_results. # Compute the MRR value for each of the search engine configurations Q_Res = [] config_names = [] # scoring functions sc_functions = [scoring.Frequency(), scoring.TF_IDF(), scoring.BM25F()] sc_fun_name = ['Frequency', 'TF_IDF', 'BM25F'] for idx in range(len(datasets)): max_number_of_results = datasets_len[idx] print('Search Engine Configuration' + "\t" + "\t" + "\t" + 'MRR') if datasets[idx] == 'Cranfield_DATASET': Q_dict = Cran_Q GT_dict = Cran_GT for elem in dir_idx_list[:3]: directory_containing_the_index = elem ix = index.open_dir(directory_containing_the_index) ### Select a Scoring-Function for s in range(len(sc_functions)): scoring_function = sc_functions[s] ### Create a Searcher for the Index with the selected Scoring-Function
import os import torch if __name__ == '__main__': Score = "bm25" # bm25, tfidf, tf Pivots_N = 10 # number of plausible English mentions Search_N = 500 # number of searched entities for each plausible English mention InputIndexDir = "data_process/DBIndex2" input_data_file = "Release/output_toy_de.json" output_data_file = "Release/output_toy_de_search.json" #------------------------------------------------------ if Score == "bm25": myscore = scoring.BM25F() elif Score =="tfidf": myscore = scoring.TF_IDF() elif Score == "tf": myscore = scoring.Frequency() elif Score == "multi": myscore = scoring.MultiWeighting(scoring.BM25F(), id=scoring.Frequency(), keys=scoring.TF_IDF()) else: myscore = scoring.BM25F() #---------------Input Query---------------------- schema = Schema(title=TEXT(stored=True, analyzer=StemmingAnalyzer()), content=TEXT(stored=True)) All_Result = [] ix = open_dir(InputIndexDir) sf = torch.nn.Softmax(dim=0) alldata = read_json(input_data_file) with ix.searcher(weighting=myscore) as searcher: parser = QueryParser("title", ix.schema,group=qparser.OrGroup) for item in tqdm(alldata):
def search_engine( analyzer = StemmingAnalyzer(), max_res = 150, multifield_flag = 1, \
                   only_title_flag = 0, \
                   directory_containing_the_index = r"C:\Users\claba\Desktop\DMT works\HW_1\Index_part_1", \
                   query_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Queries.tsv", \
                   gt_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Ground_Truth.tsv", \
                   doc_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\DOCUMENTS\\", \
                   conf_label = "Not Specified", mrr_eps = .32, \
                   k_interval_for_nDCG = range(1,151)):
    """Build an index with ``analyzer`` and evaluate four scoring functions on it.

    The index is (re)created in ``directory_containing_the_index``, filled with
    1400 documents read from ``doc_dir``, and every query of ``query_dir`` is
    run with PL2, Frequency, BM25F and TF_IDF.  For each scoring function the
    MRR is computed; when it reaches ``mrr_eps`` the R-precision summary and
    the mean nDCG@k for every ``k`` in ``k_interval_for_nDCG`` are computed too.

    :param analyzer: analyzer applied to the ``title`` and ``content`` fields.
        NOTE(review): the default is built once, when the function is defined,
        so all calls relying on the default share one analyzer instance.
    :param max_res: maximum number of hits retrieved per query
    :param multifield_flag: if truthy, search both ``title`` and ``content``
    :param only_title_flag: used only when ``multifield_flag`` is falsy; if
        truthy search ``title``, otherwise ``content``
    :param directory_containing_the_index: directory passed to ``create_in``
    :param query_dir: path of the tab-separated query file (columns
        ``Query_ID`` and ``Query`` are read)
    :param gt_dir: path of the tab-separated ground truth (columns
        ``Query_id`` and ``Relevant_Doc_id`` are read)
    :param doc_dir: prefix of the document files; document ``n`` is read from
        ``doc_dir + "______" + str(n) + ".html"``
    :param conf_label: label prefixed to every key of the returned dicts
    :param mrr_eps: MRR threshold for computing R-precision and nDCG
    :param k_interval_for_nDCG: iterable of ``k`` values for nDCG@k
    :return: ``(evaluations_summary, ndcg)``; ``evaluations_summary`` maps
        ``conf_label + "," + scoring name`` to
        ``[mrr, mean, min, first quartile, median, third quartile, max]`` of
        R-precision, or to ``[mrr]`` alone below the threshold; ``ndcg`` maps
        the same keys to the list of mean nDCG@k values
    """

    ###
    ### Create a Schema: a stored id plus unstored title/content fields
    ###
    schema = Schema(id=ID(stored=True), \
                    title = TEXT(stored=False, analyzer=analyzer),content=TEXT(stored=False, analyzer=analyzer))

    ###
    ### Create an empty Index according to the schema just defined
    ### NOTE(review): this runs on every call - presumably it replaces any
    ### index already present in that directory; confirm.
    ###
    ix = create_in(directory_containing_the_index, schema)

    ###
    ### Get the query set (reset index due to missing values in the IDs)
    ###
    query_set = pd.read_csv(query_dir, engine = "python", sep = "\t", index_col="Query_ID").reset_index()

    ###
    ### Get the ground truth: group the relevant documents by query id, then
    ### re-key them 0, 1, 2, ... by walking the original ids upwards from 1
    ### and skipping the ids that are missing
    ###
    gt_tmp = pd.read_csv(gt_dir, engine = "python", sep = "\t")
    gt_tmp = gt_tmp.groupby('Query_id')['Relevant_Doc_id'].apply(lambda x: x.tolist()).to_dict()
    gt = defaultdict(list)
    j = 1
    for i in range(len(gt_tmp)):
        # gt[i] stays [] (the defaultdict default) until an existing id j is found
        while(gt[i] == []):
            try:
                gt[i] = gt_tmp[j]
                j+=1
            except KeyError:
                j += 1

    number_of_queries = len(query_set)
    num_of_docs = 1400

    ###
    ### We'll iterate on the following lists to switch the scoring function and get its name
    ### (the name is extracted from the object's string representation)
    ###
    scoring_functions_list = [scoring.PL2(), scoring.Frequency(), scoring.BM25F(), scoring.TF_IDF()]
    scoring_name = [re.findall(r"(?<=scoring\.)[\w\W]*(?=object)", str(score))[0] for score in scoring_functions_list]

    ###
    ### Fill the Index: documents are numbered from 1 and the number is the id
    ###
    writer = ix.writer()
    for doc in range(num_of_docs):
        id_ = str(doc+1)
        title,content = doc_retriver(doc_dir+"______"+str(doc+1)+".html")
        writer.add_document(id=id_, title = title, content = content)
    writer.commit()

    ###
    ### This array stores all the results we need. Its dimensions are
    ### #Results x #Queries x #Scoring functions; unused slots stay 0
    ###
    results_mat = np.zeros([max_res,number_of_queries,len(scoring_functions_list)])
    evaluations_summary = {} # Dict to store MRR and R-Precision distribution summaries
    ndcg = defaultdict(list) # Will contain the nDCG values for varying k, for every configuration with MRR >= mrr_eps

    ###
    ### Run the search engines
    ###
    for idx_s,scorer in enumerate(scoring_functions_list):
        for idx,query in enumerate(query_set["Query"]):
            input_query = query

            ###
            ### Select a Scoring-Function
            ###
            scoring_function = scorer

            ###
            ### Create a QueryParser for parsing the input_query
            ### according to the chosen field configuration.
            ###
            if multifield_flag:
                qp = MultifieldParser(["title","content"], ix.schema)
                parsed_query = qp.parse(input_query)# parsing the query
            else:
                if only_title_flag:
                    qp = SimpleParser("title", ix.schema)
                    parsed_query = qp.parse(input_query)# parsing the query
                else:
                    qp = SimpleParser("content", ix.schema)
                    parsed_query = qp.parse(input_query)# parsing the query

            ###
            ### Create a Searcher for the Index with the selected Scoring-Function
            ### NOTE(review): a searcher is opened and closed for every single
            ### query; it is not closed if the search raises.
            ###
            searcher = ix.searcher(weighting=scoring_function)

            ###
            ### Perform a Search and store the hit ids in this query's column
            ###
            results = searcher.search(parsed_query, limit=max_res)
            results_mat[0:len(results),idx,idx_s] = [hit["id"] for hit in results]
            searcher.close()

        # Evaluate this scoring function once all the queries have been run
        mrr_res = mrr(results_mat[:,:,idx_s],gt)
        if mrr_res >= mrr_eps:

            ###
            ### Compute and summarize the R-precision distribution
            ###
            r_res = r_precision(results_mat[:,:,idx_s],gt)
            mean = np.mean(list(r_res.values()))
            first_q = np.percentile(list(r_res.values()),25)
            third_q = np.percentile(list(r_res.values()),75)
            median = np.median(list(r_res.values()))
            minr = min(list(r_res.values()))
            maxr = max(list(r_res.values()))
            evaluations_summary[conf_label+","+scoring_name[idx_s]] = [mrr_res,mean,minr,first_q,median,third_q,maxr]

            ###
            ### Compute the mean nDCG@k for varying k for this scoring function
            ###
            for k in k_interval_for_nDCG:
                tmp_res = np.mean(list(nDCG(results_mat[:,:,idx_s],gt,k = k).values()))
                ndcg[conf_label+","+scoring_name[idx_s]].append(tmp_res)
        else:
            # Below the threshold only the MRR is recorded
            evaluations_summary[conf_label+","+scoring_name[idx_s]] = [mrr_res]

        ###
        ### Progress output
        ###
        print("Configuration:"+conf_label+","+scoring_name[idx_s]+"==> MRR = "+str(mrr_res))

    return evaluations_summary, ndcg # The summary contains only the MRR for configurations below mrr_eps
def __init__(self):
    """Open the index and prepare the scorers and query parsers.

    Sets ``self.ix``, ``self.scorers_dict`` (ranking id -> weighting),
    ``self.rankings`` (its keys), ``self.qp`` (parser over the stemmed fields)
    and ``self.qp_custom`` (parser over all fields with normalised boosts).

    :raises Exception: whatever ``index.open_dir`` raised, after logging it
    """
    try:
        self.ix = index.open_dir(INDEX_BASE_DIR)
    except Exception as e:
        logger.error("Could not open index file: %s", e)
        logger.info(
            "To be able to search, an index has to be created first. Use index_website.py to create the index."
        )
        # Bare ``raise`` re-raises the active exception with its original
        # traceback.
        raise

    self.scorers_dict = {
        SearchEngine.FREQUENCY: scoring.Frequency(),
        SearchEngine.BM25: scoring.BM25F(),
        SearchEngine.TF_IDF: scoring.TF_IDF(),
        SearchEngine.PL2: scoring.PL2(),
        # Change the scoring with the custom scoring, once implemented
        SearchEngine.PAGERANK: scoring.Frequency(),
        SearchEngine.CUSTOM: scoring.MultiWeighting(
            default=scoring.BM25F(),
            # content=scoring.PL2(),
            # content_stem=scoring.PL2()
        )
    }

    self.rankings = self.scorers_dict.keys()

    # NOTE(review): ``schema`` is resolved from the enclosing scope, not from
    # ``self.ix`` - confirm that it is the schema of the opened index.
    self.qp = MultifieldParser([
        "title_stem", "description_stem", "keywords_stem", "content_stem"
    ], schema=schema)

    # Relative weight of the stemmed fields (recall) versus the unstemmed
    # ones (precision).  Floats keep the shares below true divisions even
    # where ``/`` on two ints truncates (Python 2), which would otherwise
    # turn every stemmed boost into 0.
    recall = 1.0
    precision = 2.0
    stem_share = recall / (recall + precision)
    standard_share = precision / (recall + precision)

    raw_boosts = {
        "title": 2.0,
        "description": 1.3,
        "keywords": 1.5,
        "links_in_keywords": 1.5,
        "content": 1.0,
        "title_stem": 1.2,
        "description_stem": 1.1,
        "keywords_stem": 1.2,
        "links_in_keywords_stem": 1.1,
        "content_stem": 1.0
    }

    total_standard = sum(value for key, value in raw_boosts.items()
                         if not key.endswith('_stem'))
    total_stem = sum(value for key, value in raw_boosts.items()
                     if key.endswith('_stem'))

    # Normalise each group to sum to 1, then scale it by the group's share.
    fieldboosts = {}
    for key, value in raw_boosts.items():
        if key.endswith('_stem'):
            fieldboosts[key] = (value / total_stem) * stem_share
        else:
            fieldboosts[key] = (value / total_standard) * standard_share

    self.qp_custom = MultifieldParser([
        "title", "description", "keywords", "links_in_keywords", "content",
        "title_stem", "description_stem", "keywords_stem",
        "links_in_keywords_stem", "content_stem"
    ], schema=schema, fieldboosts=fieldboosts)