Example #1
    def searcher(self):
        path = os.path.join(self.options.dir,
                            "%s_whoosh" % self.options.indexname)
        ix = index.open_dir(path)
        self.srch = ix.searcher(weighting=scoring.PL2())
        self.parser = qparser.QueryParser(self.bench.spec.main_field,
                                          schema=ix.schema)
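
For context, a minimal, self-contained sketch of how a searcher/parser pair like this is typically used; the index path, field name and query string below are placeholders, not values from the original benchmark:

import os

from whoosh import index, qparser, scoring

path = os.path.join("bench_dir", "corpus_whoosh")        # placeholder path
ix = index.open_dir(path)
srch = ix.searcher(weighting=scoring.PL2())              # PL2 weighting model
parser = qparser.QueryParser("text", schema=ix.schema)   # placeholder field name

q = parser.parse(u"example query")                       # placeholder query
for hit in srch.search(q, limit=10):
    print(hit.rank, hit.score)
srch.close()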
Example #2
def create_scoring_function(s):
    """
    Return a scoring function object, depending on the value of s.
    """
    if s == 2:
        w = scoring.PL2(c=10.0)
    else:
        w = scoring.BM25F(B=0.75, K1=1.5)

    return w
Example #3
def createScoreFunction(s):
    if s == 1:
        w = scoring.BM25F(B=0.75, K1=1.5)
        print("BM25")
    elif s == 2:
        w = scoring.PL2(c=10.0)
        print("PL2")
    else:
        w = scoring.BM25F(B=0.75, K1=1.5)
        print("BM25")
    return w
Example #4
def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True):
    ix = index.open_dir(ixdir)
    qp = qparser.QueryParser("title", ix.schema)
    q = qp.parse(qstring)

    with ix.searcher(weighting=scoring.PL2()) as s:
        if scores:
            r = s.search(q, limit=limit, optimize=optimize)
            for hit in r:
                print_record(hit.rank, basedir, hit["file"], hit["pos"])
            print("Found %d records in %0.06f seconds" % (len(r), r.runtime))
        else:
            t = now()
            count = 0  # number of records actually printed
            for i, docnum in enumerate(s.docs_for_query(q)):
                if not limit or i < limit:
                    fields = s.stored_fields(docnum)
                    print_record(i, basedir, fields["file"], fields["pos"])
                    count += 1
            print("Found %d records in %0.06f seconds" % (count, now() - t))
Example #5
    def __init__(self, whoosh_index_dir='', stopwords_file='', cache_host='localhost', cache_port=6379, **kwargs):
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(self.name, "'whoosh_index_dir=' keyword argument not specified")

        self.stopwords_file = stopwords_file
        if self.stopwords_file:
            self.stopwords = ListReader(self.stopwords_file)  # Open the stopwords file, read into a ListReader
        else:
            raise EngineConnectionException(self.name, "'stopwords_file=' keyword argument not specified")

        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)
        
        self.__verbose = False

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()
            self.parser = QueryParser('content', self.doc_index.schema)  # By default, we use AND grouping.
                                                                         # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        # Attempt to connect to the specified Redis cache.
        self.cache = RedisConn(host=cache_host, port=cache_port)
        self.cache.connect()
Example #6
    def ricerca(filename, content, rank_text):
        start_ricerca = time.time()
        passo = []

        ix = open_dir(filename)
        #searcher = ix.searcher()
        #print(list(searcher.lexicon("title")))  # lists all index terms found for the field

        if content == "":
            results = ix.searcher().search(Every('key'), limit=None)

        elif rank_text.get() == 'BM25F':

            results = ix.searcher(weighting=scoring.BM25F()).search(
                QueryParser('content', schema=ix.schema,
                            group=qparser.OrGroup).parse(content),
                limit=None)

        elif rank_text.get() == 'PL2':

            results = ix.searcher(weighting=scoring.PL2()).search(
                QueryParser('content', schema=ix.schema,
                            group=qparser.OrGroup).parse(content),
                limit=None)

        start = time.time()
        for i in results:
            passo.append(i)
        end = time.time()

        print('step time: ', end - start)
        end_ricerca = time.time()
        Query.time_ricerca = (end_ricerca - start_ricerca)
        print("tempo ricerca", Query.time_ricerca)

        return results
Example #7
    def __init__(self,
                 whoosh_index_dir='',
                 use_cache=True,
                 cache_host='localhost',
                 cache_port=6379,
                 **kwargs):
        """
        Constructor for the engine.
        """
        Engine.__init__(self, **kwargs)

        self.whoosh_index_dir = whoosh_index_dir
        if not self.whoosh_index_dir:
            raise EngineConnectionException(
                self.name,
                "'whoosh_index_dir=' keyword argument not specified")

        #  Only PL2 is wired in for now (to support other models, add a model parameter to the constructor)
        self.scoring_model_identifier = 1
        self.scoring_model = scoring.PL2(c=10.0)

        try:
            self.doc_index = open_dir(self.whoosh_index_dir)
            self.reader = self.doc_index.reader()
            self.parser = QueryParser(
                'content',
                self.doc_index.schema)  # By default, we use AND grouping.
            # Use the grouping parameter and specify whoosh.qparser.OrGroup, etc...

            #  Objects required for document snippet generation
            self.analyzer = self.doc_index.schema[
                self.parser.fieldname].analyzer
            self.fragmenter = ContextFragmenter(maxchars=200, surround=40)
            self.formatter = HtmlFormatter()
        except EmptyIndexError:
            message = "Could not open Whoosh index at '{0}'".format(
                self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)
        except OSError:
            message = "Could not open Whoosh index at '{0}' - directory does not exist".format(
                self.whoosh_index_dir)
            raise EngineConnectionException(self.name, message)

        self.use_cache = use_cache
        if self.use_cache:
            self.cache = RedisConn(host=cache_host, port=cache_port)
            self.cache.connect()

            self.page_cache_forward_look = 40  # How many additional pages to cache when required.
            self.page_cache_when = 4  # When the user is x pages away from the end of the page cache, cache more pages.

            self.page_cache_controller = PageCacheController(
                cache_host=self.cache.host,
                cache_port=self.cache.port,
                whoosh_index=self.doc_index,
                scoring_model_identifier=self.scoring_model_identifier,
                parser=self.parser,
                analyzer=self.analyzer,
                fragmenter=self.fragmenter,
                formatter=self.formatter,
                cache_forward_look=self.page_cache_forward_look)
Example #8
def search_engine( analyzer = StemmingAnalyzer(), max_res = 150, multifield_flag = 1, \
                  only_title_flag = 0, \
                  directory_containing_the_index  = r"C:\Users\claba\Desktop\DMT works\HW_1\Index_part_1", \
                  query_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Queries.tsv", \
                  gt_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Ground_Truth.tsv", \
                  doc_dir = r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\DOCUMENTS\\", \
                  conf_label = "Not Specified",
                  mrr_eps = .32, \
                  k_interval_for_nDCG = range(1,151)):
   
    
    ###
    ### Create a Schema 
    ###
    schema = Schema(id=ID(stored=True), \
                    title = TEXT(stored=False, analyzer=analyzer),content=TEXT(stored=False, analyzer=analyzer))
    
    ###
    ### Create an empty Index
    ### according to the Schema just defined ;)
    ### 
    ix = create_in(directory_containing_the_index, schema)
    
    
    ###
    ### Get the query set (reset index due to missing values in the IDs)
    ###
    query_set = pd.read_csv(query_dir, engine = "python", sep = "\t", index_col="Query_ID").reset_index()
    
    
    ###
    ### Get the ground truth (a little manipulation to group by query and align IDs)
    ###
    gt_tmp = pd.read_csv(gt_dir, engine = "python", sep = "\t")
    gt_tmp = gt_tmp.groupby('Query_id')['Relevant_Doc_id'].apply(lambda x: x.tolist()).to_dict()
    gt = defaultdict(list)
    j = 1
    for i in range(len(gt_tmp)):
        while(gt[i] == []):
            try:
                gt[i] = gt_tmp[j]
                j+=1
            except KeyError:
                j += 1
    
    
    
    number_of_queries = len(query_set)
    num_of_docs = 1400
    
    ###
    ### We'll iterate over the following lists to switch the SE scoring function and get their names
    ###
    scoring_functions_list = [scoring.PL2(), scoring.Frequency(), scoring.BM25F(), scoring.TF_IDF()]
    scoring_name = [re.findall(r"(?<=scoring\.)[\w\W]*(?=object)", str(score))[0] for score in scoring_functions_list]
    
    
    ###
    ### Fill the Index
    ###
    writer = ix.writer()
    for doc in range(num_of_docs):
        id_ = str(doc+1)
        title,content = doc_retriver(doc_dir+"______"+str(doc+1)+".html")
        writer.add_document(id=id_, title = title, content = content)
    writer.commit()
    
    
    
    ###
    ### This """tensor""" allows to store all the results we need. It's dimension are #ResultsX#QueriesX#SE_config
    ###
    results_mat = np.zeros([max_res,number_of_queries,len(scoring_functions_list)])
    
   
    evaluations_summary = {} # Dict to store MRR and R-Precision distribution summaries
    ndcg = defaultdict(list) # defaultdict holding nDCG values at varying k for all SEs with MRR > .32

    ###
    ### Run the SEs
    ###
    for idx_s,scorer in enumerate(scoring_functions_list):
        for idx,query in enumerate(query_set["Query"]):
            
            input_query = query
            
            ###
            ### Select a Scoring-Function
            ###
            scoring_function = scorer
            
            ###
            ### Create a QueryParser for 
            ### parsing the input_query based on the user's chosen SE configuration.
            ###
            if multifield_flag:
                qp = MultifieldParser(["title","content"], ix.schema)
                parsed_query = qp.parse(input_query)# parsing the query
            else:
                if only_title_flag:
                    qp = SimpleParser("title", ix.schema)
                    parsed_query = qp.parse(input_query)# parsing the query
                else:
                    qp = SimpleParser("content", ix.schema)
                    parsed_query = qp.parse(input_query)# parsing the query
                
            ###
            ### Create a Searcher for the Index
            ### with the selected Scoring-Function 
            ###
            searcher = ix.searcher(weighting=scoring_function)
            
            ###
            ### Perform a Search and store results
            ###
            results = searcher.search(parsed_query, limit=max_res)
            results_mat[0:len(results),idx,idx_s] = [hit["id"] for hit in results]
            searcher.close()
        mrr_res = mrr(results_mat[:,:,idx_s],gt)
        
        if mrr_res >= mrr_eps:
            
            ###
            ### Compute and summarize R-precision distro
            ###
            r_res = r_precision(results_mat[:,:,idx_s],gt)
            mean = np.mean(list(r_res.values()))
            first_q = np.percentile(list(r_res.values()),25)
            third_q = np.percentile(list(r_res.values()),75)
            median = np.median(list(r_res.values()))
            minr = min(list(r_res.values()))
            maxr = max(list(r_res.values()))
            evaluations_summary[conf_label+","+scoring_name[idx_s]] = [mrr_res,mean,minr,first_q,median,third_q,maxr]
            
            ###
            ### Compute nDCG@k for varying k and for each scoring function
            ###
            for k in k_interval_for_nDCG:
                tmp_res = np.mean(list(nDCG(results_mat[:,:,idx_s],gt,k = k).values()))
                ndcg[conf_label+","+scoring_name[idx_s]].append(tmp_res)
            
        else:
            evaluations_summary[conf_label+","+scoring_name[idx_s]] = [mrr_res]
        
        ###
        ### Just to see what's happening
        ###
        print("Configuration:"+conf_label+","+scoring_name[idx_s]+"==> MRR = "+str(mrr_res))
        
    return evaluations_summary, ndcg  # The evaluation result contains only the MRR for SEs with MRR < .32
Example #9
    def __init__(self):
        try:
            self.ix = index.open_dir(INDEX_BASE_DIR)
        except Exception as e:
            logger.error("Could not open index file: %s" % e)
            logger.info(
                "To be able to search, an index has to be created first. Use index_website.py to create the index."
            )
            raise e

        self.scorers_dict = {
            SearchEngine.FREQUENCY:
            scoring.Frequency(),
            SearchEngine.BM25:
            scoring.BM25F(),
            SearchEngine.TF_IDF:
            scoring.TF_IDF(),
            SearchEngine.PL2:
            scoring.PL2(),
            SearchEngine.PAGERANK:
            scoring.Frequency(),
            # Swap in the custom scoring once it is implemented
            SearchEngine.CUSTOM:
            scoring.MultiWeighting(
                default=scoring.BM25F(),
                # content=scoring.PL2(),
                # content_stem=scoring.PL2()
            )
        }

        self.rankings = self.scorers_dict.keys()
        self.qp = MultifieldParser([
            "title_stem", "description_stem", "keywords_stem", "content_stem"
        ],
                                   schema=schema)

        recall = 1
        precision = 2
        fieldboosts = {
            "title": 2.0,
            "description": 1.3,
            "keywords": 1.5,
            "links_in_keywords": 1.5,
            "content": 1.0,
            "title_stem": 1.2,
            "description_stem": 1.1,
            "keywords_stem": 1.2,
            "links_in_keywords_stem": 1.1,
            "content_stem": 1.0
        }

        total_standard = sum([
            value for key, value in fieldboosts.items()
            if not key.endswith('_stem')
        ])
        total_stem = sum([
            value for key, value in fieldboosts.items()
            if key.endswith('_stem')
        ])

        for key, value in fieldboosts.items():
            if key.endswith('_stem'):
                fieldboosts[key] = (fieldboosts[key] /
                                    total_stem) * (recall /
                                                   (recall + precision))
            else:
                fieldboosts[key] = (fieldboosts[key] /
                                    total_standard) * (precision /
                                                       (recall + precision))

        self.qp_custom = MultifieldParser([
            "title", "description", "keywords", "links_in_keywords", "content",
            "title_stem", "description_stem", "keywords_stem",
            "links_in_keywords_stem", "content_stem"
        ],
                                          schema=schema,
                                          fieldboosts=fieldboosts)
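
For reference, a minimal sketch of how MultiWeighting maps field names to weighting models; the field names and per-field models below are placeholders, not the configuration used above:

from whoosh import scoring

custom_weighting = scoring.MultiWeighting(
    default=scoring.BM25F(),           # used for any field not listed below
    content_stem=scoring.PL2(c=10.0),  # placeholder per-field assignments
    title_stem=scoring.TF_IDF(),
)
# searcher = ix.searcher(weighting=custom_weighting)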
Example #10
                        str(id_q), hit['id'],
                        str(hit.rank + 1),
                        str(hit.score)
                    ])

        searcher.close()
        print('Search Engine n.{} --> {} is ready!'.format(
            i + 1, "se_{}_{}.csv".format(i + 1, schema_t.format(i + 1))))
        print()


# Different scoring functions that we've used to compute different Search Engines
scores = [
    scoring.BM25F(),
    scoring.Frequency(),
    scoring.PL2(),
    scoring.TF_IDF(),
    scoring.FunctionWeighting(pos_score_fn)
]
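
pos_score_fn is defined elsewhere in the original script. As a purely hypothetical illustration of the kind of callback FunctionWeighting expects (a function taking searcher, fieldname, text and matcher, and returning a score), it could look like the following; it assumes the field was indexed with positions:

def pos_score_fn(searcher, fieldname, text, matcher):
    # Hypothetical example: score a posting by the position of its first occurrence
    poses = matcher.value_as("positions")
    return 1.0 / (poses[0] + 1)

pos_weighting = scoring.FunctionWeighting(pos_score_fn)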

# Directories containing queries
dir_csv_cran = r'C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Secondo semestre\DMT\Project\HW_1\part_1\Cranfield_DATASET\cran_Queries.tsv'
dir_csv_time = r'C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Secondo semestre\DMT\Project\HW_1\part_1\Time_DATASET\time_Queries.tsv'

# Directories used to store the S.E.
dir_se_cran = r'C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Secondo semestre\DMT\Project\SE Cran'
dir_se_time = r'C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Secondo semestre\DMT\Project\SE Time'

# *Cranfield Dataset*
# key -> schema and score , value -> name of file
se_configuration = {}