def searcher(self):
    path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname)
    ix = index.open_dir(path)
    self.srch = ix.searcher(weighting=scoring.PL2())
    self.parser = qparser.QueryParser(self.bench.spec.main_field, schema=ix.schema)
def create_scoring_function(s):
    """Return a scoring function object, depending on the value of s."""
    if s == 2:
        w = scoring.PL2(c=10.0)
    else:
        w = scoring.BM25F(B=0.75, K1=1.5)
    return w
def createScoreFunction(s):
    if s == 1:
        w = scoring.BM25F(B=0.75, K1=1.5)
        print("BM25")
    elif s == 2:
        w = scoring.PL2(c=10.0)
        print("PL2")
    else:
        w = scoring.BM25F(B=0.75, K1=1.5)
        print("BM25")
    return w
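# A minimal usage sketch for the factories above (assumption: "indexdir" and
# the query string are hypothetical, not from the original snippets). The
# returned weighting object plugs straight into ix.searcher(weighting=...).
from whoosh import index
from whoosh.qparser import QueryParser

ix = index.open_dir("indexdir")  # hypothetical index directory
with ix.searcher(weighting=create_scoring_function(2)) as s:  # PL2(c=10.0)
    query = QueryParser("content", ix.schema).parse("divergence from randomness")
    for hit in s.search(query, limit=10):
        print(hit.rank, hit.score)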
def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True):
    ix = index.open_dir(ixdir)
    qp = qparser.QueryParser("title", ix.schema)
    q = qp.parse(qstring)

    with ix.searcher(weighting=scoring.PL2()) as s:
        if scores:
            r = s.search(q, limit=limit, optimize=optimize)
            for hit in r:
                print_record(hit.rank, basedir, hit["file"], hit["pos"])
            print("Found %d records in %0.06f seconds" % (len(r), r.runtime))
        else:
            t = now()
            count = 0
            for i, docnum in enumerate(s.docs_for_query(q)):
                if not limit or i < limit:
                    fields = s.stored_fields(docnum)
                    print_record(i, basedir, fields["file"], fields["pos"])
                    count += 1
            # A counter avoids a NameError when the query matches nothing and
            # reports the number of records actually printed.
            print("Found %d records in %0.06f seconds" % (count, now() - t))
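# print_record() is referenced above but not shown; this is a minimal sketch
# consistent with its call sites (rank, base directory, stored file name,
# stored position) -- an assumption, not the original helper.
def print_record_sketch(rank, basedir, filename, pos):
    path = os.path.join(basedir, filename)
    print("%4d %s @ %s" % (rank, path, pos))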
def __init__(self, whoosh_index_dir='', stopwords_file='', cache_host='localhost', cache_port=6379, **kwargs):
    Engine.__init__(self, **kwargs)

    self.whoosh_index_dir = whoosh_index_dir
    if not self.whoosh_index_dir:
        raise EngineConnectionException(self.name, "'whoosh_index_dir=' keyword argument not specified")

    self.stopwords_file = stopwords_file
    if self.stopwords_file:
        self.stopwords = ListReader(self.stopwords_file)  # Open the stopwords file, read into a ListReader
    else:
        raise EngineConnectionException(self.name, "'stopwords_file=' keyword argument not specified")

    self.scoring_model_identifier = 1
    self.scoring_model = scoring.PL2(c=10.0)

    self.__verbose = False

    try:
        self.doc_index = open_dir(self.whoosh_index_dir)
        self.reader = self.doc_index.reader()
        self.parser = QueryParser('content', self.doc_index.schema)  # By default, we use AND grouping.
        # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

        # Objects required for document snippet generation
        self.analyzer = self.doc_index.schema[self.parser.fieldname].analyzer
        self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
        self.formatter = HtmlFormatter()
    except EmptyIndexError:
        message = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
        raise EngineConnectionException(self.name, message)
    except OSError:
        message = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
        raise EngineConnectionException(self.name, message)

    # Attempt to connect to the specified Redis cache.
    self.cache = RedisConn(host=cache_host, port=cache_port)
    self.cache.connect()
def ricerca(filename, content, rank_text):
    start_ricerca = time.time()
    passo = []
    ix = open_dir(filename)
    # searcher = ix.searcher()
    # print(list(searcher.lexicon("title")))  # Lists all index terms found

    if content == "":
        results = ix.searcher().search(Every('key'), limit=None)
    elif rank_text.get() == 'BM25F':
        results = ix.searcher(weighting=scoring.BM25F()).search(
            QueryParser('content', schema=ix.schema, group=qparser.OrGroup).parse(content),
            limit=None)
    elif rank_text.get() == 'PL2':
        results = ix.searcher(weighting=scoring.PL2()).search(
            QueryParser('content', schema=ix.schema, group=qparser.OrGroup).parse(content),
            limit=None)

    start = time.time()
    for i in results:
        passo.append(i)
    end = time.time()
    print('step time: ', end - start)

    end_ricerca = time.time()
    Query.time_ricerca = end_ricerca - start_ricerca
    print("search time", Query.time_ricerca)
    return results
def __init__(self, whoosh_index_dir='', use_cache=True, cache_host='localhost', cache_port=6379, **kwargs):
    """Constructor for the engine."""
    Engine.__init__(self, **kwargs)

    self.whoosh_index_dir = whoosh_index_dir
    if not self.whoosh_index_dir:
        raise EngineConnectionException(self.name, "'whoosh_index_dir=' keyword argument not specified")

    # Only put PL2 in for now (for more, add the model parameter to the constructor to specify!)
    self.scoring_model_identifier = 1
    self.scoring_model = scoring.PL2(c=10.0)

    try:
        self.doc_index = open_dir(self.whoosh_index_dir)
        self.reader = self.doc_index.reader()
        self.parser = QueryParser('content', self.doc_index.schema)  # By default, we use AND grouping.
        # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

        # Objects required for document snippet generation
        self.analyzer = self.doc_index.schema[self.parser.fieldname].analyzer
        self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
        self.formatter = HtmlFormatter()
    except EmptyIndexError:
        message = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
        raise EngineConnectionException(self.name, message)
    except OSError:
        message = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
        raise EngineConnectionException(self.name, message)

    self.use_cache = use_cache
    if self.use_cache:
        self.cache = RedisConn(host=cache_host, port=cache_port)
        self.cache.connect()

        self.page_cache_forward_look = 40  # How many additional pages to cache when required.
        self.page_cache_when = 4  # When the user is x pages away from the end of the page cache, cache more pages.

        self.page_cache_controller = PageCacheController(
            cache_host=self.cache.host,
            cache_port=self.cache.port,
            whoosh_index=self.doc_index,
            scoring_model_identifier=self.scoring_model_identifier,
            parser=self.parser,
            analyzer=self.analyzer,
            fragmenter=self.fragmenter,
            formatter=self.formatter,
            cache_forward_look=self.page_cache_forward_look)
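# Minimal sketch of how the snippet-generation objects set up above can be
# used (assumption: `engine`, `parsed_query`, and `document_text` are
# hypothetical placeholders, not names from the original code). Whoosh's
# module-level highlight() takes exactly these pieces.
from whoosh.highlight import highlight

def make_snippet(engine, parsed_query, document_text):
    # Collect the query's terms and hand them to the highlighter along with
    # the engine's analyzer, fragmenter, and formatter.
    terms = frozenset(text for _, text in parsed_query.iter_all_terms())
    return highlight(document_text, terms, engine.analyzer,
                     engine.fragmenter, engine.formatter, top=2)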
def search_engine(analyzer=StemmingAnalyzer(), max_res=150, multifield_flag=1,
                  only_title_flag=0,
                  directory_containing_the_index=r"C:\Users\claba\Desktop\DMT works\HW_1\Index_part_1",
                  query_dir=r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Queries.tsv",
                  gt_dir=r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Ground_Truth.tsv",
                  doc_dir=r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\DOCUMENTS\\",
                  conf_label="Not Specified", mrr_eps=.32,
                  k_interval_for_nDCG=range(1, 151)):
    ###
    ### Create a Schema
    ###
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=False, analyzer=analyzer),
                    content=TEXT(stored=False, analyzer=analyzer))

    ###
    ### Create an empty Index
    ### according to the just-defined Schema ;)
    ###
    ix = create_in(directory_containing_the_index, schema)

    ###
    ### Get the query set (reset index due to missing values in the IDs)
    ###
    query_set = pd.read_csv(query_dir, engine="python", sep="\t", index_col="Query_ID").reset_index()

    ###
    ### Get the ground truth (a little manipulation to group by query and align IDs)
    ###
    gt_tmp = pd.read_csv(gt_dir, engine="python", sep="\t")
    gt_tmp = gt_tmp.groupby('Query_id')['Relevant_Doc_id'].apply(lambda x: x.tolist()).to_dict()
    gt = defaultdict(list)
    j = 1
    for i in range(len(gt_tmp)):
        while gt[i] == []:
            try:
                gt[i] = gt_tmp[j]
                j += 1
            except KeyError:
                j += 1

    number_of_queries = len(query_set)
    num_of_docs = 1400

    ###
    ### We'll iterate over the following lists to switch SE scoring functions and get their names
    ###
    scoring_functions_list = [scoring.PL2(), scoring.Frequency(), scoring.BM25F(), scoring.TF_IDF()]
    scoring_name = [re.findall(r"(?<=scoring\.)[\w\W]*(?=object)", str(score))[0] for score in scoring_functions_list]

    ###
    ### Fill the Index
    ###
    writer = ix.writer()
    for doc in range(num_of_docs):
        id_ = str(doc + 1)
        title, content = doc_retriver(doc_dir + "______" + str(doc + 1) + ".html")
        writer.add_document(id=id_, title=title, content=content)
    writer.commit()

    ###
    ### This """tensor""" stores all the results we need. Its dimensions are #Results x #Queries x #SE_configs
    ###
    results_mat = np.zeros([max_res, number_of_queries, len(scoring_functions_list)])

    evaluations_summary = {}  # Dict to store MRR and R-Precision distribution summaries
    ndcg = defaultdict(list)  # Dict that will contain nDCG values over varying k for all SEs with MRR >= .32

    ###
    ### Run the SEs
    ###
    for idx_s, scorer in enumerate(scoring_functions_list):
        for idx, query in enumerate(query_set["Query"]):
            input_query = query

            ###
            ### Select a Scoring-Function
            ###
            scoring_function = scorer

            ###
            ### Create a QueryParser for
            ### parsing the input_query based on the user's chosen SE configuration.
            ###
            if multifield_flag:
                qp = MultifieldParser(["title", "content"], ix.schema)
                parsed_query = qp.parse(input_query)  # parse the query
            elif only_title_flag:
                qp = SimpleParser("title", ix.schema)
                parsed_query = qp.parse(input_query)  # parse the query
            else:
                qp = SimpleParser("content", ix.schema)
                parsed_query = qp.parse(input_query)  # parse the query

            ###
            ### Create a Searcher for the Index
            ### with the selected Scoring-Function
            ###
            searcher = ix.searcher(weighting=scoring_function)

            ###
            ### Perform a Search and store the results
            ###
            results = searcher.search(parsed_query, limit=max_res)
            results_mat[0:len(results), idx, idx_s] = [hit["id"] for hit in results]
            searcher.close()

        mrr_res = mrr(results_mat[:, :, idx_s], gt)

        if mrr_res >= mrr_eps:
            ###
            ### Compute and summarize the R-precision distribution
            ###
            r_res = r_precision(results_mat[:, :, idx_s], gt)
            mean = np.mean(list(r_res.values()))
            first_q = np.percentile(list(r_res.values()), 25)
            third_q = np.percentile(list(r_res.values()), 75)
            median = np.median(list(r_res.values()))
            minr = min(list(r_res.values()))
            maxr = max(list(r_res.values()))
            evaluations_summary[conf_label + "," + scoring_name[idx_s]] = [mrr_res, mean, minr, first_q, median, third_q, maxr]

            ###
            ### Compute nDCG@k for varying k and for each scoring function
            ###
            for k in k_interval_for_nDCG:
                tmp_res = np.mean(list(nDCG(results_mat[:, :, idx_s], gt, k=k).values()))
                ndcg[conf_label + "," + scoring_name[idx_s]].append(tmp_res)
        else:
            evaluations_summary[conf_label + "," + scoring_name[idx_s]] = [mrr_res]

        ###
        ### Just to see what's happening
        ###
        print("Configuration:" + conf_label + "," + scoring_name[idx_s] + " ==> MRR = " + str(mrr_res))

    return evaluations_summary, ndcg  # The evaluation result, obviously, contains only MRR for SEs with MRR < .32
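# The mrr() helper referenced above is not shown. A minimal sketch of mean
# reciprocal rank under the same layout -- results: a #Results x #Queries
# matrix of ranked doc ids with 0 for empty slots; gt: dict mapping query
# index to a list of relevant doc ids; np is NumPy, as in the surrounding
# code. This is an assumption, not the original implementation.
def mrr_sketch(results, gt):
    reciprocal_ranks = []
    for q in range(results.shape[1]):
        relevant = set(gt.get(q, []))
        rr = 0.0
        for rank, doc_id in enumerate(results[:, q], start=1):
            if int(doc_id) in relevant:
                rr = 1.0 / rank  # reciprocal rank of the first relevant hit
                break
        reciprocal_ranks.append(rr)
    return float(np.mean(reciprocal_ranks))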
def __init__(self):
    try:
        self.ix = index.open_dir(INDEX_BASE_DIR)
    except Exception as e:
        logger.error("Could not open index file: %s" % e)
        logger.info("To be able to search, an index has to be created first. Use index_website.py to create the index.")
        raise e

    self.scorers_dict = {
        SearchEngine.FREQUENCY: scoring.Frequency(),
        SearchEngine.BM25: scoring.BM25F(),
        SearchEngine.TF_IDF: scoring.TF_IDF(),
        SearchEngine.PL2: scoring.PL2(),
        SearchEngine.PAGERANK: scoring.Frequency(),  # Change the scoring to the custom scoring, once implemented
        SearchEngine.CUSTOM: scoring.MultiWeighting(
            default=scoring.BM25F(),
            # content=scoring.PL2(),
            # content_stem=scoring.PL2()
        )
    }
    self.rankings = self.scorers_dict.keys()

    self.qp = MultifieldParser(
        ["title_stem", "description_stem", "keywords_stem", "content_stem"],
        schema=schema)

    recall = 1
    precision = 2
    fieldboosts = {
        "title": 2.0,
        "description": 1.3,
        "keywords": 1.5,
        "links_in_keywords": 1.5,
        "content": 1.0,
        "title_stem": 1.2,
        "description_stem": 1.1,
        "keywords_stem": 1.2,
        "links_in_keywords_stem": 1.1,
        "content_stem": 1.0
    }
    total_standard = sum(value for key, value in fieldboosts.items() if not key.endswith('_stem'))
    total_stem = sum(value for key, value in fieldboosts.items() if key.endswith('_stem'))

    # Normalize each group of boosts to sum to 1, then weight the standard
    # fields by precision/(recall+precision) and the stemmed fields by
    # recall/(recall+precision).
    for key, value in fieldboosts.items():
        if key.endswith('_stem'):
            fieldboosts[key] = (fieldboosts[key] / total_stem) * (recall / (recall + precision))
        else:
            fieldboosts[key] = (fieldboosts[key] / total_standard) * (precision / (recall + precision))

    self.qp_custom = MultifieldParser(
        ["title", "description", "keywords", "links_in_keywords", "content",
         "title_stem", "description_stem", "keywords_stem", "links_in_keywords_stem", "content_stem"],
        schema=schema,
        fieldboosts=fieldboosts)
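# Worked example of the normalization above (checking the arithmetic with the
# values from the fieldboosts dict; not code from the original): with
# recall = 1 and precision = 2,
#   total_standard = 2.0 + 1.3 + 1.5 + 1.5 + 1.0 = 7.3
#   "title"      -> (2.0 / 7.3) * (2 / 3) ~= 0.183
#   total_stem   = 1.2 + 1.1 + 1.2 + 1.1 + 1.0 = 5.6
#   "title_stem" -> (1.2 / 5.6) * (1 / 3) ~= 0.071
# so the standard fields jointly carry 2/3 of the total boost mass and the
# stemmed fields 1/3.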
                str(id_q), hit['id'], str(hit.rank + 1), str(hit.score)
            ])
    searcher.close()
    print('Search Engine n.{} --> {} is ready!'.format(
        i + 1, "se_{}_{}.csv".format(i + 1, schema_t.format(i + 1))))
    print()

# Different scoring functions that we've used to build the different Search Engines
scores = [
    scoring.BM25F(),
    scoring.Frequency(),
    scoring.PL2(),
    scoring.TF_IDF(),
    scoring.FunctionWeighting(pos_score_fn)
]

# Directories containing the queries
dir_csv_cran = r'C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Secondo semestre\DMT\Project\HW_1\part_1\Cranfield_DATASET\cran_Queries.tsv'
dir_csv_time = r'C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Secondo semestre\DMT\Project\HW_1\part_1\Time_DATASET\time_Queries.tsv'

# Directories used to store the S.E. output
dir_se_cran = r'C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Secondo semestre\DMT\Project\SE Cran'
dir_se_time = r'C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Secondo semestre\DMT\Project\SE Time'

# *Cranfield Dataset*
# key -> schema and score, value -> name of file
se_configuration = {}