Example #1
0
    def __init__(self, corpus, queries_group, vars_quantile, **kwargs):
        """Resolve the input paths and load the result and query files.

        Args:
            corpus: corpus identifier, stored as ``self.corpus`` and passed to the path setters.
            queries_group: queries group name; ``'title'`` makes the prediction queries
                results come from ``self.title_res_file`` instead of ``self.results_file``.
            vars_quantile: forwarded to the path setter when ``graphs`` is not given.
            **kwargs: optional settings:
                top_docs_overlap (default 10) -- stored as ``self.top_docs_overlap``.
                rbo_top (default 100) -- stored as ``self.rbo_top``.
                graphs -- when truthy, the graph paths are used instead of the regular ones.
                n -- number of vars, mandatory when ``graphs`` is given.

        Raises:
            AssertionError: if ``graphs`` is given without a truthy ``n``.
        """
        self.top_docs_overlap = kwargs.get('top_docs_overlap', 10)
        self.rbo_top = kwargs.get('rbo_top', 100)
        self.corpus = corpus
        self.queries_group = queries_group

        graphs = kwargs.get('graphs')
        if not graphs:
            self.__set_paths(corpus, queries_group, vars_quantile)
        else:
            n = kwargs.get('n')
            assert n, 'Missing number of vars'
            self.__set_graph_paths(corpus, queries_group, graphs, n)

        raw_results = dp.ResultsReader(self.results_file, 'trec')
        # The title group is predicted from its own results file, every other group from the raw results
        self.prediction_queries_res_data = (
            dp.ResultsReader(self.title_res_file, 'trec') if queries_group == 'title' else raw_results)

        self.queries_data = dp.QueriesTextParser(self.queries_full_file, 'uqv')
        self.topics_data = dp.QueriesTextParser(self.queries_topic_file)

        # These 2 DF used for the filtering method
        self.variations_data = dp.QueriesTextParser(self.queries_variations_file, 'uqv')
        self.quantile_variations_data = dp.QueriesTextParser(self.queries_quantile_vars, 'uqv')

        self.raw_res_data = raw_results
        self.fused_data = dp.ResultsReader(self.fused_results_file, 'trec')
        self.query_vars = self.queries_data.query_vars
Example #2
0
def main(args):
    """Run the SMV predictor for one or for all configured numbers of documents.

    Reads ``args.results`` (trec results), ``args.corpus_scores`` (predictions format),
    ``args.queries`` (XML queries) and ``args.docs``. When ``args.docs`` is falsy the
    predictor is calculated for every value in ``NUMBER_OF_DOCS``.
    """
    results_path, scores_path, queries_path, docs = (
        args.results, args.corpus_scores, args.queries, args.docs)

    # NOTE: the queries are parsed as XML here; dp.QueriesTextParser is the plain-text alternative
    predictor = SMV(
        dp.QueriesXMLParser(queries_path),
        dp.ResultsReader(results_path, 'trec'),
        dp.ResultsReader(scores_path, 'predictions'),
    )

    docs_to_run = [docs] if docs else NUMBER_OF_DOCS
    for n in docs_to_run:
        predictor.calc_results(n)
Example #3
0
def main(args):
    """Load the trec results and split them by the queries that should be predicted.

    Reads ``args.results``, ``args.queries_to_predict`` and ``args.full_queries_file``.
    """
    results_path, predict_path, full_path = (
        args.results, args.queries_to_predict, args.full_queries_file)

    all_results = dp.ResultsReader(results_path, 'trec').data_df
    # queries_to_predict takes the full queries file first and the prediction subset second
    predict_map = queries_to_predict(full_path, predict_path)
    pred_res_df, vars_res_df = split_prediction_queries(all_results, predict_map)
Example #4
0
    def __init__(self, predictor, corpus, qgroup, vars_quantile, **kwargs):
        """Resolve the input paths and build the data frames used for the reference predictions.

        Args:
            predictor: predictor name, passed to the path setters.
            corpus: corpus identifier, passed to the path setters and to ``features_loader``.
            qgroup: queries group; ``'title'`` loads the base results from
                ``self.base_results_dir``, any other group derives them from the variants results.
            vars_quantile: forwarded to the path setter when ``graphs`` is not given.
            **kwargs: optional ``graphs`` (switches to the graph paths) and ``n``
                (number of vars, mandatory together with ``graphs``).

        Raises:
            AssertionError: if ``graphs`` is given without a truthy ``n``.
        """
        graphs = kwargs.get('graphs')
        if not graphs:
            self.__set_paths(corpus, predictor, qgroup, vars_quantile)
        else:
            n = kwargs.get('n')
            assert n, 'Missing number of vars'
            self.__set_graph_paths(corpus, predictor, qgroup, graphs, n)

        queries_to_predict = dp.QueriesTextParser(self.queries2predict_file, 'uqv')
        self.var_cv = InterTopicCrossValidation(folds_map_file=self.folds,
                                                predictions_dir=self.vars_results_dir)
        vars_results = self.var_cv.full_set

        # Initialize the base prediction results of the queries to be predicted
        if qgroup != 'title':
            self.base_results_df = dp.convert_vid_to_qid(
                vars_results.loc[queries_to_predict.queries_dict.keys()])
        else:
            self.base_results_df = InterTopicCrossValidation(
                folds_map_file=self.folds, predictions_dir=self.base_results_dir).full_set
        self.base_results_df.rename_axis('topic', inplace=True)

        self.query_vars = dp.QueriesTextParser(self.query_vars_file, 'uqv')
        quantile_vars = dp.QueriesTextParser(self.quantile_vars_file, 'uqv')
        features = features_loader(self.features, corpus)

        def topic_qid_pairs():
            # A fresh frame on every call, so the consumers below never share one object
            return features.reset_index()[['topic', 'qid']]

        self.features_df = self.__initialize_features_df(quantile_vars, features)
        self.var_scores_df = self.__initialize_var_scores_df(topic_qid_pairs(), vars_results)
        self.geo_mean_df = self.__initialize_geo_scores_df(
            topic_qid_pairs(), dp.ResultsReader(self.geo_mean_file, 'predictions').data_df)
        self.real_ap_df = self.__initialize_var_scores_df(
            topic_qid_pairs(), dp.ResultsReader(self.real_ap_file, 'ap').data_df)
        self.geo_as_predictor()
Example #5
0
 def __init__(self, qpp_ref: QueryPredictionRef, corr_measure='pearson'):
     """Collect the learning-to-rank inputs from a prepared ``QueryPredictionRef``.

     Args:
         qpp_ref: source object; its ``predictor``, ``features_df``, ``var_scores_df``,
             ``ap_file``, ``var_cv``, ``output_dir`` and ``calc_integrated`` attributes
             are read here.
         corr_measure: stored as ``self.corr_measure`` (default ``'pearson'``).

     Side effects:
         Calls ``dp.ensure_dir`` on ``<qpp_ref.output_dir>/ltr/<predictor>/``.
     """
     self.corr_measure = corr_measure
     self.features_df = qpp_ref.features_df
     self.results_df = qpp_ref.var_scores_df
     self.ap_obj = dp.ResultsReader(qpp_ref.ap_file, 'ap')
     self.folds_df = qpp_ref.var_cv.data_sets_map.transpose()
     self.output_dir = f'{qpp_ref.output_dir}/ltr/{qpp_ref.predictor}/'
     dp.ensure_dir(self.output_dir)
     self.calc_features_df = qpp_ref.calc_integrated
     self.feature_names = self.features_df.columns.tolist()
     # Leave one core free, but never go below one worker:
     # on a single-core machine cpu_count() - 1 would be 0, which is not a usable worker count.
     self.cpu_cores = max(1, mp.cpu_count() - 1)
Example #6
0
 def __init__(self, corpus):
     """Set the paths for ``corpus`` and load the queries, the features and the QL results.

     The queries are parsed in the ``'uqv'`` format; their data frame is passed through
     ``dp.add_topic_to_qdf`` and then indexed by ``qid``.
     """
     self.corpus = corpus
     self.__set_paths()

     queries = dp.QueriesTextParser(self.queries_txt_file, kind='uqv')
     with_topics = dp.add_topic_to_qdf(queries.queries_df)
     queries.queries_df = with_topics.set_index('qid')
     self.queries_obj = queries

     self.features_df = self.initialize_features_df()
     self.ql_results_obj = dp.ResultsReader(self.ql_results_file, 'trec')
Example #7
0
 def _build_full_set(predictions_dir, ap_file=None):
     """Join all the predictions files of a directory into a single DataFrame.

     Assuming the predictions files are named : predictions-[*]

     Every file contributes one column named ``score_<suffix>``, where ``<suffix>`` is the
     text after the last ``-`` of the file path. When ``ap_file`` is given, its data frame
     is appended as an additional column set.

     Args:
         predictions_dir: directory that holds the predictions files.
         ap_file: optional AP file to join with the predictions.

     Returns:
         The column-wise concatenation (``axis=1``, sorted index) of all the loaded frames.

     Raises:
         AssertionError: if nothing was loaded or the resulting DataFrame is empty.
     """
     all_files = glob.glob(predictions_dir + "/*predictions*")
     if 'uef' in predictions_dir:
         # Excluding all the 5 and 10 docs predictions
         if 'qf' in predictions_dir:
             all_files = [fn for fn in all_files if
                          not os.path.basename(fn).endswith('-5+', 11, 14) and not os.path.basename(fn).endswith(
                              '-10+', 11, 15)]
         else:
             all_files = [fn for fn in all_files if not os.path.basename(fn).endswith(('-5', '-10'))]
     frames = []
     for file_ in all_files:
         fname = file_.split('-')[-1]
         df = dp.ResultsReader(file_, 'predictions').data_df
         frames.append(df.rename(columns={"score": f'score_{fname}'}))
     if ap_file:
         frames.append(dp.ResultsReader(ap_file, 'ap').data_df)
     # pd.concat raises a bare ValueError on an empty list, which used to hide the message below
     assert frames, f'The Full set DF is empty, make sure that {predictions_dir} is not empty'
     full_set = pd.concat(frames, axis=1, sort=True)
     assert not full_set.empty, f'The Full set DF is empty, make sure that {predictions_dir} is not empty'
     return full_set
Example #8
0
 def __init__(self,
              corpus,
              max_n=20,
              corr_measure='pearson',
              load_from_pkl=True,
              queries_group='title'):
     """Set the paths, load the queries and the raw AP file and initialize the basic results.

     Args:
         corpus: corpus identifier, stored as ``self.corpus`` and passed to the path setter.
         max_n: upper bound for ``self.max_n``; the effective value is capped by the largest
             ``qid`` count of a single topic in the queries data frame.
         corr_measure: stored as ``self.corr_measure``.
         load_from_pkl: stored as ``self.load_from_pkl``.
         queries_group: stored as ``self.group`` and passed to the path setter.
     """
     self.group = queries_group
     self.corr_measure = corr_measure
     self.load_from_pkl = load_from_pkl
     self.__set_paths(corpus, queries_group)
     self.corpus = corpus

     queries = dp.QueriesTextParser(self.queries_file)
     self.queries_obj = queries
     queries.queries_df = add_topic_to_qdf(queries.queries_df)
     self.raw_ap_obj = dp.ResultsReader(self.raw_ap_file, 'ap')

     # The largest number of queries that share a single topic
     per_topic_counts = self.queries_obj.queries_df.groupby('topic').count()
     largest_topic = per_topic_counts.max()['qid']
     self.max_n = min(largest_topic, max_n)

     self.basic_results_dict = defaultdict(float)
     self.__initialize_basic_results_dict()
Example #9
0
def main(args):
    """Filter a UQV queries file and report statistics, plots or top differences.

    Reads the following attributes of ``args``:
        queries   -- path of the UQV queries file; nothing is done when it is missing.
        remove    -- optional queries file whose queries are removed from the loaded ones.
        ap        -- optional AP results file; the filtering and reporting steps require it.
        group     -- queries group; any value other than ``'title'`` selects a filter function
                     from ``filter_functions_dict`` (``'top'``, ``'low'``, ``'medl'``, ``'medh'``).
        quant     -- optional quantile name (``'low'`` or ``'high'``), used for the title group only.
        stats     -- when truthy, ``calc_statistics`` is run and the function returns.
        plot_vars -- when truthy (and ``stats`` is falsy), ``plot_variants_ap`` is run and
                     the function returns.

    When neither ``stats`` nor ``plot_vars`` is set, ``print_top_differences`` is called.

    Raises:
        KeyError: if ``group`` or ``quant`` is not a key of the matching lookup dictionary.
    """
    queries_txt_file = args.queries
    queries_to_remove = args.remove
    ap_file = args.ap
    queries_group = args.group
    quant_variants = args.quant
    stats = args.stats
    plot_vars = args.plot_vars

    # Without a queries file there is nothing to process. The check must precede the corpus
    # detection below, which runs a substring test on the path and fails on None.
    if not queries_txt_file:
        return

    filter_functions_dict = {
        'top': filter_top_queries,
        'low': filter_low_queries,
        'medl': filter_medl_queries,
        'medh': filter_medh_queries
    }
    # quantiles_dict = {'low': [0, 0.33], 'med': [0.33, 0.66], 'top': [0.66, 1]}
    quantiles_dict = {'low': [0, 0.5], 'high': [0.5, 1]}

    corpus = 'ROBUST' if 'ROBUST' in queries_txt_file else 'ClueWeb12B'

    qdb = dt.QueriesTextParser(queries_txt_file, 'uqv')
    # NOTE(review): the returned frame is not used; the call is kept in case it updates
    # qdb.queries_df in place - confirm against add_topic_to_qdf
    add_topic_to_qdf(qdb.queries_df)
    qdb.queries_df = remove_duplicates(qdb)
    if queries_to_remove:
        qdb_rm = dt.QueriesTextParser(queries_to_remove)
        qdb.queries_df = remove_q1_from_q2(qdb_rm.queries_df, qdb)

    # Every remaining step needs the AP data, so stop here instead of failing on an unbound name
    if not ap_file:
        return

    apdb = dt.ResultsReader(ap_file, 'ap')
    if queries_group != 'title':
        qdb.queries_df = filter_functions_dict[queries_group](qdb.queries_df, apdb)
    elif quant_variants:
        qdb.queries_df = filter_quant_variants(qdb.queries_df, apdb, quantiles_dict[quant_variants])

    if stats or plot_vars:
        # Both reports compare the variants against the title queries and their AP
        title_queries_file = dt.ensure_file(f'~/QppUqvProj/data/{corpus}/queries_{corpus}_title.txt')
        title_queries_df = dt.QueriesTextParser(title_queries_file).queries_df
        title_ap_file = dt.ensure_file(f'~/QppUqvProj/Results/{corpus}/test/basic/QLmap1000')
        title_ap = dt.ResultsReader(title_ap_file, 'ap')
        if stats:
            # stats takes precedence over plot_vars
            calc_statistics(qdb.queries_df, apdb, title_queries_df, title_ap, filter_functions_dict,
                            quantiles_dict, corpus)
        else:
            plot_variants_ap(qdb.queries_df, apdb, title_queries_df, title_ap, corpus)
        return

    print_top_differences(qdb.queries_df, apdb, corpus)