def calc_rbo(pred_res_df, vars_res_df, top):
    """
    Calculates the RBO similarity between the top results of two result lists.
    Each DF is cut to its first `top` rows and converted to a {docID: docScore} dict,
    the two dicts are then passed to rbo_dict (called with its default parameters).
    :param pred_res_df: pandas DF that has (at least) the columns 'docID' and 'docScore'
    :param vars_res_df: pandas DF that has (at least) the columns 'docID' and 'docScore'
    :param top: number of leading rows to take from each DF
    :return: the result of rbo_dict for the two dicts
    """

    def _top_scores(res_df):
        # The columns must be selected with a list of labels; res_df['docID', 'docScore']
        # is a lookup of a single column keyed by a tuple and raises KeyError on a flat
        # column index. Selecting with a list also returns a new object, so the inputs
        # are never modified.
        top_df = res_df[['docID', 'docScore']].head(top)
        return top_df.set_index('docID')['docScore'].to_dict()

    # Pass the dicts (not the DFs) and hand the scores back to the caller.
    return rbo_dict(_top_scores(pred_res_df), _top_scores(vars_res_df))
def _calc_features(self):
    """
    Calculates the pairwise similarity features for the query pairs in self.features_index.
    Every (q1, q2) pair yields a row for the edge (q1, q2) and, when the two queries differ,
    a second row for the reverse edge (q2, q1). Each row holds the jaccard_coefficient of the
    two query texts, the list_overlap of the top retrieved docs, the 'ext' RBO score between
    the two result lists, and the 'ext' RBO score between the fused list and the result list
    of the destination query.
    The resulting DF is also pickled under '<res_dir>/test/pageRank/'.
    :return: pandas DF indexed by (topic, src, dest), sorted by these keys
    """
    overlap_col = f'Top_{self.top_docs_overlap}_Docs_overlap'
    rbo_col = f'RBO_EXT_{self.rbo_top}'
    rbo_fused_col = f'RBO_FUSED_EXT_{self.rbo_top}'
    _dict = {'topic': [], 'src': [], 'dest': [], 'jac': [], overlap_col: [], rbo_col: [], rbo_fused_col: []}

    def _save_to_dict(topic, src, dest, jc, docs_overlap, rbo_ext_score, dest_fused_score):
        # The topic is stored per row, so all the columns always have the same length
        # regardless of how many self-pairs (q, q) the topic has.
        _dict['topic'].append(topic)
        _dict['src'].append(src)
        _dict['dest'].append(dest)
        _dict['jac'].append(jc)
        _dict[overlap_col].append(docs_overlap)
        # Must be the same key the dict was initialised with (RBO_EXT_*), otherwise KeyError
        _dict[rbo_col].append(rbo_ext_score)
        # The RBO-F feature for an edge (src, dest) is the RBO similarity of dest to the fused list
        _dict[rbo_fused_col].append(dest_fused_score)

    for topic, pairs in self.features_index.items():
        # NOTE(review): the fused list size is hard-coded to 100 here, while the query lists
        # use self.rbo_top - confirm this is intended
        fused_res_dict = self.fused_data.get_res_dict_by_qid(topic, top=100)
        for q1, q2 in pairs:
            txt1 = self.queries_data.get_qid_txt(q1)
            txt2 = self.queries_data.get_qid_txt(q2)
            jc = jaccard_coefficient(txt1, txt2)

            l1 = self.raw_res_data.get_docs_by_qid(q1, self.top_docs_overlap)
            l2 = self.raw_res_data.get_docs_by_qid(q2, self.top_docs_overlap)
            docs_overlap = list_overlap(l1, l2)

            # All RBO values are rounded to 10 decimal digits
            q1_results_dict = self.raw_res_data.get_res_dict_by_qid(q1, top=self.rbo_top)
            q2_results_dict = self.raw_res_data.get_res_dict_by_qid(q2, top=self.rbo_top)
            rbo_ext_score = np.around(rbo_dict(q1_results_dict, q2_results_dict, p=0.95)['ext'], 10)
            q1_fused_score = np.around(rbo_dict(fused_res_dict, q1_results_dict, p=0.95)['ext'], 10)
            q2_fused_score = np.around(rbo_dict(fused_res_dict, q2_results_dict, p=0.95)['ext'], 10)

            _save_to_dict(topic, q1, q2, jc, docs_overlap, rbo_ext_score, q2_fused_score)
            if q1 != q2:
                # The reverse edge points at q1, hence it takes the fused score of q1
                _save_to_dict(topic, q2, q1, jc, docs_overlap, rbo_ext_score, q1_fused_score)

    _df = pd.DataFrame.from_dict(_dict)
    _df.sort_values(['topic', 'src', 'dest'], inplace=True)
    _df.set_index(['topic', 'src', 'dest'], inplace=True)
    _test_dir = dp.ensure_dir(f'{self.res_dir}/test/pageRank/')
    _df.to_pickle(f'{_test_dir}/{self.corpus}_raw_PageRank_Features.pkl')
    return _df
def calc_list_features(self, overlap_size=100, rbo_size=100):
    """
    Calculates list similarity features for all the query pairs (already existing) in the self.features_df.
    The method calculates the features for the passed list sizes and returns a new df with the result,
    the self.features_df itself is not modified.
    The overlap feature is the result of list_overlap for the two doc lists, the RBO feature is the
    'min' entry of the rbo_dict result for the two result dicts.
    :param overlap_size: size of the list for overlap feature
    :param rbo_size: size of the list for RBO feature
    :return: pandas DF indexed by (topic, q1, q2), with the calculated features (based on the self.features_df)
             in the columns 'overlap_<overlap_size>' and 'rbo_<rbo_size>'
    """
    results = self.ql_results_obj
    overlaps = []
    rbos = []
    # The values are collected in row order and assigned as whole columns. Writing
    # features_df.loc[topic, q1, q2] = [...] row by row targets every column of the row, so
    # it works only when the DF has exactly the two feature columns, and it's ambiguous when
    # a (topic, q1, q2) key is duplicated.
    for q1, q2 in zip(self.features_df['q1'], self.features_df['q2']):
        overlaps.append(list_overlap(results.get_docs_by_qid(q1, overlap_size),
                                     results.get_docs_by_qid(q2, overlap_size)))
        rbos.append(rbo_dict(results.get_res_dict_by_qid(q1, rbo_size),
                             results.get_res_dict_by_qid(q2, rbo_size))['min'])
    # set_index keeps the row order, therefore the lists align with the rows positionally
    features_df = self.features_df.set_index(['topic', 'q1', 'q2']).assign(overlap=overlaps, rbo=rbos)
    return features_df.rename(columns={'overlap': f'overlap_{overlap_size}', 'rbo': f'rbo_{rbo_size}'})
def _calc_features(self):
    """
    Builds the similarity features between every predicted query ('query at hand') and each of
    the variations of its topic; the query itself is included when it is one of the variations.
    For every (topic, variation) row the features are the jaccard_coefficient of the two texts,
    the list_overlap of the top docs, the 'ext' RBO score between the two result lists and the
    'ext' RBO score between the fused list and the variation's result list.
    :return: pandas DF with a row per (topic, qid), not indexed
    """
    overlap_col = f'Top_{self.top_docs_overlap}_Docs_overlap'
    rbo_col = f'RBO_EXT_{self.rbo_top}'
    rbo_fused_col = f'RBO_FUSED_EXT_{self.rbo_top}'
    features = {
        'topic': [],
        'qid': [],
        'Jac_coefficient': [],
        overlap_col: [],
        rbo_col: [],
        rbo_fused_col: [],
    }
    for topic in self.topics_data.queries_dict.keys():
        # The part before the first '-' is the key used for the variations and the fused list
        base_topic = topic.partition('-')[0]
        variations = self.query_vars.get(base_topic)
        features['topic'].extend([topic] * len(variations))

        fused_results = self.fused_data.get_res_dict_by_qid(base_topic, top=self.rbo_top)
        topic_text = self.topics_data.get_qid_txt(topic)
        topic_top_docs = self.prediction_queries_res_data.get_docs_by_qid(topic, self.top_docs_overlap)
        topic_results = self.prediction_queries_res_data.get_res_dict_by_qid(topic, top=self.rbo_top)

        for variation in variations:
            variation_text = self.queries_data.get_qid_txt(variation)
            jaccard = jaccard_coefficient(topic_text, variation_text)

            variation_top_docs = self.raw_res_data.get_docs_by_qid(variation, self.top_docs_overlap)
            overlap = list_overlap(topic_top_docs, variation_top_docs)

            # Both RBO scores are rounded to 10 decimal digits
            variation_results = self.raw_res_data.get_res_dict_by_qid(variation, top=self.rbo_top)
            topic_rbo = rbo_dict(topic_results, variation_results, p=0.95)
            rbo_ext = np.around(topic_rbo['ext'], 10)
            fused_rbo = rbo_dict(fused_results, variation_results, p=0.95)
            rbo_fused_ext = np.around(fused_rbo['ext'], 10)

            features['qid'].append(variation)
            features['Jac_coefficient'].append(jaccard)
            features[overlap_col].append(overlap)
            features[rbo_col].append(rbo_ext)
            features[rbo_fused_col].append(rbo_fused_ext)
    return pd.DataFrame.from_dict(features)