class EarlyFusionScorer(FusionScorer):
    """Early-fusion scorer that ranks objects with a BM25-style formula.

    Term frequencies of the retrieved documents are first fused into a
    per-object term frequency (weighted by the document-object association
    weight); the fused frequency is then plugged into the BM25 term score.
    """

    def __init__(self, index_name, association_file, object_length_file,
                 assoc_mode, retr_params, field="content", run_id="fusion"):
        """
        :param index_name: name of index
        :param association_file: document-object association file
        :param object_length_file: object length file
        :param assoc_mode: document-object weight mode, uniform or binary
        :param retr_params: BM25 parameter dict; "k1" defaults to 1.2 and
            "b" defaults to 0.75 when missing
        :param field: field to be searched
        :param run_id: identifier stored on the scorer as ``run_id``
        """
        self._index_name = index_name
        self._elastic = Elastic(self._index_name)
        self._k1 = retr_params.get("k1", 1.2)
        self._b = retr_params.get("b", 0.75)
        self._field = field
        self._o_l = object_length(object_length_file)
        self._collection_length = self._elastic.coll_length(self._field)
        self._N = self._elastic.num_docs()
        self._assoc_mode = assoc_mode
        self.association_file = association_file
        # Both association maps start empty; they are read by score_query and
        # are expected to be filled before scoring (not done in this block).
        self.assoc_doc = {}
        self.assoc_obj = {}
        self.run_id = run_id

    def score_query(self, query):
        """
        Scores a given query.

        :param query: query to be searched
        :return: RetrievalResults wrapping a dict object_id -> score (pqo);
            the dict is empty when no object associations are available
        """
        aquery = self._elastic.analyze_query(query)
        pr = self._elastic.search(aquery, self._field)

        # Scoring objects, i.e., computing P(q|o)
        pqo = {}

        # Without any associated object there is nothing to score, and the
        # average object length below would be a division by zero.
        if not self.assoc_obj:
            return RetrievalResults(pqo)
        avg_ol = self._collection_length / len(self.assoc_obj)

        q = self.parse(aquery)
        qt = Counter(q)

        # Each distinct query term is scored once and summed up; the query
        # term frequency is not used by this scorer.
        for t in qt:
            # Retrieving documents and gets IDF
            # number of documents containing term t
            n = len(self._elastic.search(t, self._field))
            if n == 0:
                continue
            idf = math.log((self._N - n + 0.5) / (n + 0.5))

            stems = stemmer.stemWords(t.split())
            if not stems:
                # Empty token: there is no stem to look up.
                continue
            term = stems[0]

            # Fuses f(t,o) for each object
            ftd_fused = {}
            for doc_id in pr.keys():
                if doc_id not in self.assoc_doc:
                    continue
                try:
                    ftd = self._elastic.term_freq(doc_id, term, self._field)
                except Exception:
                    # doc without content
                    ftd = 0
                for object_id in self.assoc_doc[doc_id]:
                    if self._assoc_mode == FusionScorer.ASSOC_MODE_BINARY:
                        w_do = 1
                    elif self._assoc_mode == FusionScorer.ASSOC_MODE_UNIFORM:
                        w_do = 1 / len(self.assoc_obj[object_id])
                    else:
                        w_do = 0  # this should never happen
                    ftd_fused[object_id] = (
                        ftd_fused.get(object_id, 0) + ftd * w_do)

            # Add pto into pqo
            for object_id in self.assoc_obj:
                ol = int(self._o_l[object_id])
                fftd = ftd_fused.get(object_id, 0)
                score = (fftd * (self._k1 + 1)) / (
                    fftd + self._k1 * (1 - self._b + self._b * ol / avg_ol))
                pqo[object_id] = pqo.get(object_id, 0) + idf * score

        return RetrievalResults(pqo)
class EarlyFusionScorer(FusionScorer):
    """Early-fusion scorer that ranks objects with a smoothed language model.

    Per-document term probabilities of the retrieved documents are fused into
    a per-object term probability (weighted by the document-object association
    weight), which is then interpolated with the collection term probability.
    """

    def __init__(self, index_name, association_file, assoc_mode, retr_params,
                 run_id="fusion", field="content", num=100):
        """
        :param index_name: name of index
        :param association_file: document-object association file
        :param assoc_mode: document-object weight mode, uniform or binary
        :param retr_params: parameter dict; "lambda" is the smoothing
            parameter and defaults to 0.1 when missing
        :param run_id: identifier stored on the scorer as ``run_id``
        :param field: field to be searched
        :param num: value passed as ``num`` to the document search
        """
        self._index_name = index_name
        self._elastic = Elastic(self._index_name)
        self._lambda = retr_params.get("lambda", 0.1)
        self._field = field
        self._collection_length = self._elastic.coll_length(self._field)
        self._assoc_mode = assoc_mode
        self._num = num
        self.association_file = association_file
        # Both association maps start empty; they are read by score_query and
        # are expected to be filled before scoring (not done in this block).
        self.assoc_doc = {}
        self.assoc_obj = {}
        self.run_id = run_id

    def score_query(self, query):
        """
        Scores a given query.

        :param query: query to be searched
        :return: RetrievalResults wrapping a dict object_id -> log score (pqo)
        """
        # retrieving documents
        aquery = self._elastic.analyze_query(query)
        pr = self._elastic.search(aquery, self._field, num=self._num)
        q = self.parse(aquery)

        # scoring objects, i.e., computing P(q|o)
        pqo = {}
        qt = Counter(q)

        # Scores each query term and sums up, i.e., computing P(t|o)
        for t, ftq in qt.items():
            stems = stemmer.stemWords(t.split())
            if not stems:
                # Empty token: there is no stem to look up.
                print("Ignore term", t)
                continue
            term = stems[0]

            # Gets term frequency in collections
            try:
                ftc = self._elastic.coll_term_freq(term, self._field)
            except Exception:
                print("Ignore term", t)
                continue
            if not ftc:
                # None means the lookup found nothing; 0 would make ptc zero
                # and log() below undefined for objects without evidence.
                print("Ignore term", t)
                continue
            ptc = ftc / self._collection_length

            # Fuses ptd for each object
            ptd_fused = {}
            for doc_id in pr.keys():
                if doc_id not in self.assoc_doc:
                    continue
                try:
                    ftd = self._elastic.term_freq(doc_id, term, self._field)
                except Exception:
                    # the content of doc is empty
                    ftd = 0
                if ftd:
                    doc_length = self._elastic.doc_length(doc_id, self._field)
                    # A zero length cannot be divided by; contribute nothing.
                    ptd = ftd / doc_length if doc_length else 0
                else:
                    # No occurrences: the probability is 0 whatever the
                    # length, so the length lookup is skipped.
                    ptd = 0
                for object_id in self.assoc_doc[doc_id]:
                    if self._assoc_mode == FusionScorer.ASSOC_MODE_BINARY:
                        w_do = 1
                    elif self._assoc_mode == FusionScorer.ASSOC_MODE_UNIFORM:
                        w_do = 1 / len(self.assoc_obj[object_id])
                    else:
                        w_do = 0  # this should never happen
                    ptd_fused[object_id] = (
                        ptd_fused.get(object_id, 0) + ptd * w_do)

            # Adds pto to pqo
            for object_id in self.assoc_obj:
                fptd = ptd_fused.get(object_id, 0)
                pto = math.log(
                    (1 - self._lambda) * fptd + self._lambda * ptc) * ftq
                pqo[object_id] = pqo.get(object_id, 0) + pto

        return RetrievalResults(pqo)