Example #1
def __init__(self, fieldname):
    '''
    Constructor
    '''
    self.w_parser = SimpleParser(fieldname, None)
    self.w_parser.add_plugin(FieldsPlugin())
    self.w_parser.add_plugin(OperatorsPlugin())
    self.w_parser.add_plugin(PhrasePlugin())
    self.w_parser.add_plugin(SingleQuotePlugin())
    self.w_parser.add_plugin(GroupPlugin())
    self.w_parser.add_plugin(PrefixPlugin())
    self.w_parser.add_plugin(GtLtPlugin())
    self.w_parser.add_plugin(RangePlugin())
    self.query = None
    self.current_node_stack = []
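A minimal sketch of how a parser built this way can be exercised; the field name and query string below are made up, while SimpleParser and the plugins come from whoosh.qparser:

from whoosh.qparser import SimpleParser, GroupPlugin, OperatorsPlugin

# Schema-less parser over a single (hypothetical) field.
parser = SimpleParser("content", None)
parser.add_plugin(GroupPlugin())      # enables (...) grouping
parser.add_plugin(OperatorsPlugin())  # enables AND / OR / NOT

# parse() returns a whoosh.query tree that a searcher can execute.
print(parser.parse("(foo OR bar) AND baz"))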
Example #2
def read(self, fieldnames, query, callback):
    with self.ix.searcher() as searcher:
        start = time.time()
        query = SimpleParser(fieldnames, self._whoosh_schema).parse(query)
        end = time.time()
        print('query took', end - start, 'seconds')
        callback(searcher.search(query))
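This read() runs the callback while the searcher is still open, because Whoosh Results objects are only valid inside the with block. A hypothetical call site (reader, its ix index, and _whoosh_schema are assumed from the snippet):

def print_hits(results):
    for hit in results:        # each hit behaves like a dict of stored fields
        print(dict(hit))

reader.read("content", "hello world", print_hits)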
Example #3
def searchBodyAndHighlight(q):
    # ix, s, schema, and colorIpythonFormatter are module-level globals
    # in the source project.
    parser = SimpleParser("body", schema=ix.schema)
    q = parser.parse(q)
    terms = [text for fieldname, text in q.all_terms()
             if fieldname == "body"]

    r = s.search(q)
    analyzer = schema["body"].analyzer  # on modern Whoosh the analyzer lives on the field
    print("will tokenize with", terms)
    fragmenter = highlight.ContextFragmenter(maxchars=400, surround=80)
    # formatter = highlight.HtmlFormatter()
    formatter = colorIpythonFormatter

    for d in r:
        # The text argument to highlight() is the stored text of the body field.
        text = d["body"]
        res = highlight.highlight(text, terms, analyzer, fragmenter, formatter)
        print(unicodedata.normalize('NFKC', res))
        print("-" * 8)
Example #4
def read(self, fieldnames, query, callback):
    start = time.time()
    ix = open_dir(self._need_base, self._need_index)
    end = time.time()
    print('opendir took', end - start, 'seconds')
    with ix.searcher() as searcher:
        start = time.time()
        # query = QueryParser(field, self._whoosh_schema).parse(query)
        query = SimpleParser(fieldnames, self._whoosh_schema).parse(query)
        end = time.time()
        print('query took', end - start, 'seconds')
        callback(searcher.search(query))
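open_dir() is timed separately here because opening the index is comparatively expensive on every call; a common alternative is to open it once and reuse it (a hypothetical wrapper sketch):

from whoosh.index import open_dir

class CachedReader:
    def __init__(self, base, name):
        # Open the index once; reuse self.ix for every subsequent read().
        self.ix = open_dir(base, name)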
Example #5
def search(q):
    parser = SimpleParser("body", schema=ix.schema)
    q = parser.parse(q)
    terms = [text for fieldname, text in q.all_terms()
             if fieldname == "body"]

    # Just extract sentence-level fragments for each hit;
    # highlight_extracts is a helper defined elsewhere in the source.
    search_results = []
    with ix.searcher() as s:
        r = s.search(q)
        for d in r:
            text = d["body"]   # stored text of the body field
            path = d["path"]
            for ex in highlight_extracts(path, text, terms):
                search_results.append((path, ex[0], ex[1], ex[2], ex[3]))
    search_results.sort(key=lambda x: x[1], reverse=True)
    return search_results[:20], terms
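A hypothetical call site; the sort key in search() suggests the second tuple element (ex[0]) is a fragment score:

results, terms = search("whoosh highlighting")
for path, score, *rest in results:
    print(score, path)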
Example #6
def search_engine(analyzer=StemmingAnalyzer(), max_res=150, multifield_flag=1,
                  only_title_flag=0,
                  directory_containing_the_index=r"C:\Users\claba\Desktop\DMT works\HW_1\Index_part_1",
                  query_dir=r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Queries.tsv",
                  gt_dir=r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\cran_Ground_Truth.tsv",
                  doc_dir=r"C:\Users\claba\Desktop\DMT works\HW_1\part_1\Cranfield_DATASET\DOCUMENTS\\",
                  conf_label="Not Specified",
                  mrr_eps=.32,
                  k_interval_for_nDCG=range(1, 151)):
   
    
    ###
    ### Create a Schema 
    ###
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=False, analyzer=analyzer),
                    content=TEXT(stored=False, analyzer=analyzer))
    
    ###
    ### Create an empty index
    ### according to the schema just defined
    ###
    ix = create_in(directory_containing_the_index, schema)
    
    
    ###
    ### Get the query set (reset index due to missing values in the IDs)
    ###
    query_set = pd.read_csv(query_dir, engine = "python", sep = "\t", index_col="Query_ID").reset_index()
    
    
    ###
    ### Get the ground truth (a little manipulation to group by query and align the IDs)
    ###
    gt_tmp = pd.read_csv(gt_dir, engine = "python", sep = "\t")
    gt_tmp = gt_tmp.groupby('Query_id')['Relevant_Doc_id'].apply(lambda x: x.tolist()).to_dict()
    gt = defaultdict(list)
    # Re-key the ground truth to consecutive positions (0, 1, 2, ...),
    # skipping query IDs that are missing from the file.
    j = 1
    for i in range(len(gt_tmp)):
        while gt[i] == []:
            try:
                gt[i] = gt_tmp[j]
                j += 1
            except KeyError:
                j += 1
    
    
    
    number_of_queries = len(query_set)
    num_of_docs = 1400
    
    ###
    ### We'll iterate over the following lists to switch the SE scoring function and get its name
    ###
    scoring_functions_list = [scoring.PL2(), scoring.Frequency(), scoring.BM25F(), scoring.TF_IDF()]
    scoring_name = [re.findall(r"(?<=scoring\.)[\w\W]*(?=object)", str(score))[0] for score in scoring_functions_list]
    
    
    ###
    ### Fill the Index
    ###
    writer = ix.writer()
    for doc in range(num_of_docs):
        id_ = str(doc + 1)
        # doc_retriver is a helper (defined elsewhere) that returns the
        # title and body text of a Cranfield document.
        title, content = doc_retriver(doc_dir + "______" + str(doc + 1) + ".html")
        writer.add_document(id=id_, title=title, content=content)
    writer.commit()
    
    
    
    ###
    ### This """tensor""" allows to store all the results we need. It's dimension are #ResultsX#QueriesX#SE_config
    ###
    results_mat = np.zeros([max_res,number_of_queries,len(scoring_functions_list)])
    
   
    evaluations_summary = {} # dict storing MRR and R-precision distribution summaries
    ndcg = defaultdict(list) # nDCG values over varying k, for every SE with MRR >= mrr_eps

    ###
    ### Run the SEs
    ###
    for idx_s,scorer in enumerate(scoring_functions_list):
        for idx,query in enumerate(query_set["Query"]):
            
            input_query = query
            
            ###
            ### Select a Scoring-Function
            ###
            scoring_function = scorer
            
            ###
            ### Create a QueryParser for parsing the input_query,
            ### based on the chosen SE configuration.
            ###
            if multifield_flag:
                qp = MultifieldParser(["title","content"], ix.schema)
                parsed_query = qp.parse(input_query)  # parse the query
            else:
                if only_title_flag:
                    qp = SimpleParser("title", ix.schema)
                    parsed_query = qp.parse(input_query)  # parse the query
                else:
                    qp = SimpleParser("content", ix.schema)
                    parsed_query = qp.parse(input_query)  # parse the query
                
            ###
            ### Create a Searcher for the Index
            ### with the selected Scoring-Function 
            ###
            searcher = ix.searcher(weighting=scoring_function)
            
            ###
            ### Perform a Search and store results
            ###
            results = searcher.search(parsed_query, limit=max_res)
            results_mat[0:len(results),idx,idx_s] = [hit["id"] for hit in results]
            searcher.close()
        mrr_res = mrr(results_mat[:,:,idx_s],gt)
        
        if mrr_res >= mrr_eps:
            
            ###
            ### Compute and summarize the R-precision distribution
            ###
            r_res = r_precision(results_mat[:,:,idx_s],gt)
            mean = np.mean(list(r_res.values()))
            first_q = np.percentile(list(r_res.values()),25)
            third_q = np.percentile(list(r_res.values()),75)
            median = np.median(list(r_res.values()))
            minr = min(list(r_res.values()))
            maxr = max(list(r_res.values()))
            evaluations_summary[conf_label+","+scoring_name[idx_s]] = [mrr_res,mean,minr,first_q,median,third_q,maxr]
            
            ###
            ### Compute nDCG@k for varying k and for each scoring function
            ###
            for k in k_interval_for_nDCG:
                tmp_res = np.mean(list(nDCG(results_mat[:,:,idx_s],gt,k = k).values()))
                ndcg[conf_label+","+scoring_name[idx_s]].append(tmp_res)
            
        else:
            evaluations_summary[conf_label+","+scoring_name[idx_s]] = [mrr_res]
        
        ###
        ### Just to see what's happening
        ###
        print("Configuration:"+conf_label+","+scoring_name[idx_s]+"==> MRR = "+str(mrr_res))
        
    return evaluations_summary, ndcg # for SEs with MRR < mrr_eps, the summary contains only their MRR
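A hypothetical invocation of the whole pipeline (doc_retriver, mrr, r_precision, and nDCG are helpers defined elsewhere in the source project):

summary, ndcg_curves = search_engine(conf_label="Stemming")
for config, stats in summary.items():
    print(config, "-> MRR =", stats[0])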
Example #7
def get_parser(self):
    return SimpleParser('name', schema=OffersSchema())
Example #8
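For context, the class below relies on roughly these imports; the SQ combinator comes from django-haystack, while HAYSTACK_DEFAULT_OPERATOR is a project-level constant not shown in the snippet (assumed here):

from whoosh.qparser import (SimpleParser, FieldsPlugin, OperatorsPlugin,
                            PhrasePlugin, SingleQuotePlugin, GroupPlugin,
                            PrefixPlugin, GtLtPlugin, RangePlugin)
from whoosh.query import (Term, And, Or, Not, AndNot, AndMaybe,
                          Phrase, Prefix, TermRange)
from haystack.query import SQ

HAYSTACK_DEFAULT_OPERATOR = SQ.AND  # assumption; mirrors haystack's default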
class QueryParser(object):
    def __init__(self, fieldname):
        '''
        Constructor
        '''
        self.w_parser = SimpleParser(fieldname, None)
        self.w_parser.add_plugin(FieldsPlugin())
        self.w_parser.add_plugin(OperatorsPlugin())
        self.w_parser.add_plugin(PhrasePlugin())
        self.w_parser.add_plugin(SingleQuotePlugin())
        self.w_parser.add_plugin(GroupPlugin())
        self.w_parser.add_plugin(PrefixPlugin())
        self.w_parser.add_plugin(GtLtPlugin())
        self.w_parser.add_plugin(RangePlugin())
        self.query = None
        self.current_node_stack = []

    def parse(self, query):
        # Parse the raw query string with Whoosh, then walk the resulting
        # tree and rebuild it as a haystack SQ tree.
        self.query = SQ()
        self.current_node_stack = [(self.query, HAYSTACK_DEFAULT_OPERATOR)]

        wquery = self.w_parser.parse(query)

        self.visit(wquery)

        if len(self.query) == 1 and isinstance(self.query.children[0], SQ):
            return self.query.children[0]
        else:
            return self.query

    def visit(self, q):
        # Dispatch on the type of the Whoosh query node.
        if isinstance(q, Term):
            current_node, current_connector = self.current_node_stack.pop()
            current_node.add(SQ(**{q.fieldname: q.text}), current_connector)
            self.current_node_stack.append((current_node, current_connector))
        elif isinstance(q, And):
            self._add_compound_query(q, SQ.AND)
        elif isinstance(q, AndMaybe):
            self._add_andmaybe(q)
        elif isinstance(q, Or):
            self._add_compound_query(q, SQ.OR)
        elif isinstance(q, AndNot):
            self._add_andnot(q)
        elif isinstance(q, Not):
            self._add_not(q)
        elif isinstance(q, Phrase):
            self._add_phrase(q)
        elif isinstance(q, Prefix):
            self._add_prefix(q)
        elif isinstance(q, TermRange):
            self._add_range(q)

    def _add_compound_query(self, q, connector):

        new_node = SQ()
        self.current_node_stack.append((new_node, connector))
        for subquery in q.subqueries:
            self.visit(subquery)
        self.current_node_stack.pop()

        if len(new_node) == 1 and isinstance(new_node.children[0], SQ):
            new_node = new_node.children[0]

        current_node, current_connector = self.current_node_stack[-1]
        current_node.add(new_node, current_connector)

    def _add_andnot(self, q):

        new_node = SQ()
        self.current_node_stack.append((new_node, SQ.AND))
        self.visit(q.a)
        self.visit(Not(q.b))
        self.current_node_stack.pop()

        if len(new_node) == 1 and isinstance(new_node.children[0], SQ):
            new_node = new_node.children[0]

        current_node, current_connector = self.current_node_stack[-1]
        current_node.add(new_node, current_connector)

    def _add_andmaybe(self, q):
        # AndMaybe ("a ANDMAYBE b") has no SQ equivalent; approximate it
        # as a plain AND of both operands.
        new_node = SQ()
        self.current_node_stack.append((new_node, SQ.AND))
        self.visit(q.a)
        self.visit(q.b)
        self.current_node_stack.pop()

        if len(new_node) == 1 and isinstance(new_node.children[0], SQ):
            new_node = new_node.children[0]

        current_node, current_connector = self.current_node_stack[-1]
        current_node.add(new_node, current_connector)

    def _add_not(self, q):

        new_node = SQ()
        self.current_node_stack.append((new_node, SQ.AND))
        self.visit(q.query)
        self.current_node_stack.pop()

        if len(new_node) == 1 and isinstance(new_node.children[0], SQ):
            new_node = new_node.children[0]

        current_node, current_connector = self.current_node_stack[-1]
        current_node.add(~new_node, current_connector)

    def _add_phrase(self, q):
        new_node = SQ(**{q.fieldname + "__exact": " ".join(q.words)})
        current_node, current_connector = self.current_node_stack[-1]
        current_node.add(new_node, current_connector)

    def _add_prefix(self, q):
        new_node = SQ(**{q.fieldname + "__startswith": q.text})
        current_node, current_connector = self.current_node_stack[-1]
        current_node.add(new_node, current_connector)

    def _add_range(self, q):

        if q.start is None:
            if q.endexcl:
                postfix = "__lt"
            else:
                postfix = "__lte"
            new_node = SQ(**{q.fieldname + postfix: self.__convert_nb(q.end)})
        elif q.end is None:
            if q.startexcl:
                postfix = "__gt"
            else:
                postfix = "__gte"
            new_node = SQ(
                **{q.fieldname + postfix: self.__convert_nb(q.start)})
        else:
            new_node = SQ(
                **{
                    q.fieldname + "__range":
                    [self.__convert_nb(q.start),
                     self.__convert_nb(q.end)]
                })

        current_node, current_connector = self.current_node_stack[-1]
        current_node.add(new_node, current_connector)

    def __convert_nb(self, str_nb):
        # Coerce a range bound to int or float when possible, otherwise
        # fall back to the raw string.
        try:
            res = int(str_nb)
            return res
        except ValueError:
            try:
                res = float(str_nb)
                return res
            except ValueError:
                return str_nb
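A minimal round-trip sketch, assuming a configured django-haystack project (the field name and query string are made up):

qp = QueryParser("title")
sq = qp.parse("title:(foo OR bar) AND NOT baz")
print(sq)  # an SQ tree, usable as SearchQuerySet().filter(sq)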