Exemple #1
0
def calc_rbo(pred_res_df, vars_res_df, top):
    """Compare the leading rows of two result frames with ``rbo_dict``.

    Each frame is cut to its first ``top`` rows and turned into a
    ``{docID: docScore}`` mapping; the two mappings are then passed to
    ``rbo_dict`` and its result is returned.

    :param pred_res_df: pandas DF holding at least the columns 'docID' and 'docScore'
    :param vars_res_df: pandas DF holding at least the columns 'docID' and 'docScore'
    :param top: number of leading rows of each DF to take into account
    :return: the value returned by ``rbo_dict`` for the two mappings
    """

    def _top_scores(res_df):
        # Columns must be selected with a *list* of labels; a bare
        # ``df['docID', 'docScore']`` is looked up as one tuple label and
        # raises KeyError on a frame with flat column names.
        # NOTE(review): presumably the rows are already ordered by rank, so
        # ``head(top)`` yields the top documents - confirm against the caller.
        _top_df = res_df[['docID', 'docScore']].head(top)
        # Non-inplace set_index works on a fresh frame, so the caller's DF is
        # never touched and no chained-assignment warning is triggered.
        return _top_df.set_index('docID')['docScore'].to_dict()

    # Pass the score mappings (not the intermediate frames) and hand the
    # similarity back to the caller instead of discarding it.
    return rbo_dict(_top_scores(pred_res_df), _top_scores(vars_res_df))
Exemple #2
0
    def _calc_features(self):
        """Build the pairwise similarity features table and pickle it.

        For every topic in ``self.features_index`` and every query pair (q1, q2)
        listed for it, the following values are computed:
          * 'jac' - ``jaccard_coefficient`` of the two query texts
          * 'Top_{top_docs_overlap}_Docs_overlap' - ``list_overlap`` of the two documents lists
          * 'RBO_EXT_{rbo_top}' - the 'ext' value of ``rbo_dict`` for the two results dicts
          * 'RBO_FUSED_EXT_{rbo_top}' - the 'ext' value of ``rbo_dict`` between the fused
            results of the topic and the results of the edge destination query
        A pair of identical queries produces a single row (src == dest); any other
        pair produces two rows, one for each direction of the edge.

        The table is written to '{res_dir}/test/pageRank/{corpus}_raw_PageRank_Features.pkl'.

        :return: pandas DF indexed by ['topic', 'src', 'dest'], sorted by that index
        """
        overlap_col = f'Top_{self.top_docs_overlap}_Docs_overlap'
        rbo_col = f'RBO_EXT_{self.rbo_top}'
        rbo_fused_col = f'RBO_FUSED_EXT_{self.rbo_top}'
        _dict = {'topic': [], 'src': [], 'dest': [], 'jac': [],
                 overlap_col: [], rbo_col: [], rbo_fused_col: []}

        def _save_to_dict(_topic, src, dest, jac, overlap, rbo_ext, rbo_fused_ext):
            # The topic is stored per row, so all the columns always have the same length
            # regardless of how many self-pairs the topic has.
            _dict['topic'].append(_topic)
            _dict['src'].append(src)
            _dict['dest'].append(dest)
            _dict['jac'].append(jac)
            _dict[overlap_col].append(overlap)
            # Must be the very same key the dict was initialised with ('RBO_EXT_...')
            _dict[rbo_col].append(rbo_ext)
            _dict[rbo_fused_col].append(rbo_fused_ext)

        for topic, pairs in self.features_index.items():
            # NOTE(review): the fused list is cut at a fixed depth of 100 and not at
            # self.rbo_top - confirm this is intended.
            fused_res_dict = self.fused_data.get_res_dict_by_qid(topic, top=100)
            for q1, q2 in pairs:
                txt1 = self.queries_data.get_qid_txt(q1)
                txt2 = self.queries_data.get_qid_txt(q2)
                jc = jaccard_coefficient(txt1, txt2)

                l1 = self.raw_res_data.get_docs_by_qid(q1, self.top_docs_overlap)
                l2 = self.raw_res_data.get_docs_by_qid(q2, self.top_docs_overlap)
                docs_overlap = list_overlap(l1, l2)

                # All RBO values are rounded to 10 decimal digits, to suppress floating point noise
                q1_results_dict = self.raw_res_data.get_res_dict_by_qid(q1, top=self.rbo_top)
                q2_results_dict = self.raw_res_data.get_res_dict_by_qid(q2, top=self.rbo_top)
                rbo_ext_score = np.around(rbo_dict(q1_results_dict, q2_results_dict, p=0.95)['ext'], 10)

                # The RBO-F feature of an edge (src, dest) is the RBO similarity of dest to the fused list
                _q2_rbo_fused_ext_score = np.around(rbo_dict(fused_res_dict, q2_results_dict, p=0.95)['ext'], 10)
                _save_to_dict(topic, q1, q2, jc, docs_overlap, rbo_ext_score, _q2_rbo_fused_ext_score)

                if q1 != q2:
                    # The reversed edge (q2, q1) ends at q1, hence it takes the fused similarity of q1
                    _q1_rbo_fused_ext_score = np.around(rbo_dict(fused_res_dict, q1_results_dict, p=0.95)['ext'], 10)
                    _save_to_dict(topic, q2, q1, jc, docs_overlap, rbo_ext_score, _q1_rbo_fused_ext_score)

        _df = pd.DataFrame.from_dict(_dict)
        _df.sort_values(['topic', 'src', 'dest'], inplace=True)
        _df.set_index(['topic', 'src', 'dest'], inplace=True)
        _test_dir = dp.ensure_dir(f'{self.res_dir}/test/pageRank/')
        _df.to_pickle(f'{_test_dir}/{self.corpus}_raw_PageRank_Features.pkl')
        return _df
Exemple #3
0
    def calc_list_features(self, overlap_size=100, rbo_size=100):
        """
        Calculates list similarity features for all the query pairs (already existing) in the self.features_df.
        The method calculates the features for the passed list sizes and returns a new df with the result;
        self.features_df itself is not modified.
        Any other column of self.features_df is carried over to the result unchanged.
        :param overlap_size: size of the list for overlap feature
        :param rbo_size: size of the list for RBO feature
        :return: pandas DF indexed by ['topic', 'q1', 'q2'] with the added columns
                 'overlap_{overlap_size}' and 'rbo_{rbo_size}'
        """
        overlap_values = []
        rbo_values = []
        # The features depend only on the (q1, q2) pair, so the values are collected in row order
        # and attached positionally. This avoids the per-row MultiIndex lookup, and it avoids the
        # whole-row assignment that fails once the DF has columns other than the two features.
        for q1, q2 in zip(self.features_df['q1'], self.features_df['q2']):
            overlap_values.append(
                list_overlap(
                    self.ql_results_obj.get_docs_by_qid(q1, overlap_size),
                    self.ql_results_obj.get_docs_by_qid(q2, overlap_size)))
            # Only the 'min' value of the RBO result is kept as the feature
            rbo_values.append(
                rbo_dict(
                    self.ql_results_obj.get_res_dict_by_qid(q1, rbo_size),
                    self.ql_results_obj.get_res_dict_by_qid(q2, rbo_size))['min'])

        # set_index keeps the rows order, so the collected lists line up with the rows
        features_df = self.features_df.set_index(['topic', 'q1', 'q2']).assign(
            overlap=overlap_values, rbo=rbo_values)
        return features_df.rename(columns={
            'overlap': f'overlap_{overlap_size}',
            'rbo': f'rbo_{rbo_size}'
        })
Exemple #4
0
    def _calc_features(self):
        """Calculate the similarity features between each 'query at hand' (the query being predicted)
        and all the variations of its topic, including the query itself if it's among the variations.

        For every key of self.topics_data.queries_dict, the part of the key before the first '-' is used
        to look up the variations (self.query_vars) and the fused results (self.fused_data).
        One row is produced per (topic, variation) pair, with the columns:
        'topic', 'qid', 'Jac_coefficient', 'Top_{top_docs_overlap}_Docs_overlap',
        'RBO_EXT_{rbo_top}' and 'RBO_FUSED_EXT_{rbo_top}'.

        :return: pandas DF with a default index, holding the calculated features
        """
        overlap_key = f'Top_{self.top_docs_overlap}_Docs_overlap'
        rbo_key = f'RBO_EXT_{self.rbo_top}'
        rbo_fused_key = f'RBO_FUSED_EXT_{self.rbo_top}'

        features = {
            'topic': [],
            'qid': [],
            'Jac_coefficient': [],
            overlap_key: [],
            rbo_key: [],
            rbo_fused_key: [],
        }

        for topic in self.topics_data.queries_dict.keys():
            base_topic = topic.split('-')[0]
            variations = self.query_vars.get(base_topic)
            # One row per variation, hence the topic is repeated once for each of them
            features['topic'].extend([topic] * len(variations))

            fused_results = self.fused_data.get_res_dict_by_qid(base_topic, top=self.rbo_top)
            topic_text = self.topics_data.get_qid_txt(topic)
            topic_docs = self.prediction_queries_res_data.get_docs_by_qid(topic, self.top_docs_overlap)
            topic_results = self.prediction_queries_res_data.get_res_dict_by_qid(topic, top=self.rbo_top)

            for var in variations:
                var_text = self.queries_data.get_qid_txt(var)
                jaccard = jaccard_coefficient(topic_text, var_text)

                var_docs = self.raw_res_data.get_docs_by_qid(var, self.top_docs_overlap)
                overlap = list_overlap(topic_docs, var_docs)

                # Both RBO values are rounded to 10 decimal digits
                var_results = self.raw_res_data.get_res_dict_by_qid(var, top=self.rbo_top)
                topic_rbo = rbo_dict(topic_results, var_results, p=0.95)
                rbo_ext = np.around(topic_rbo['ext'], 10)

                fused_rbo = rbo_dict(fused_results, var_results, p=0.95)
                rbo_fused_ext = np.around(fused_rbo['ext'], 10)

                features['qid'].append(var)
                features['Jac_coefficient'].append(jaccard)
                features[overlap_key].append(overlap)
                features[rbo_key].append(rbo_ext)
                features[rbo_fused_key].append(rbo_fused_ext)

        return pd.DataFrame.from_dict(features)