def create_overlap_ref_queries(*queries):
    """Return the intersection of several UQV query files.

    The first file seeds the result; every subsequent file is inner-merged
    into it, so only queries present in all files survive. The merged frame
    is printed before being returned.
    """
    overlap_df = dt.QueriesTextParser(queries[0], 'uqv').queries_df
    for extra_file in queries[1:]:
        extra_df = dt.QueriesTextParser(extra_file, 'uqv').queries_df
        overlap_df = overlap_df.merge(extra_df, how='inner')
    print(overlap_df)
    return overlap_df
def __init__(self, corpus, queries_group, vars_quantile, **kwargs):
    """Load every result/query data source needed for the prediction pipeline.

    :param corpus: corpus identifier used to resolve file paths.
    :param queries_group: which query group drives prediction; 'title'
        switches the prediction results to the title-queries run.
    :param vars_quantile: quantile label used when resolving variant paths.
    :param kwargs: optional tuning knobs — 'top_docs_overlap' (default 10),
        'rbo_top' (default 100), 'graphs' (enables graph-specific paths and
        then requires 'n', the number of variants).
    """
    self.top_docs_overlap = kwargs.get('top_docs_overlap', 10)
    self.rbo_top = kwargs.get('rbo_top', 100)
    self.corpus = corpus
    self.queries_group = queries_group
    graphs = kwargs.get('graphs', None)
    if graphs:
        # Graph mode needs the number of variants to build its paths.
        n = kwargs.get('n', None)
        assert n, 'Missing number of vars'
        self.__set_graph_paths(corpus, queries_group, graphs, n)
    else:
        self.__set_paths(corpus, queries_group, vars_quantile)
    # Raw (UQV) retrieval results; reused below as self.raw_res_data.
    _raw_res_data = dp.ResultsReader(self.results_file, 'trec')
    if queries_group == 'title':
        # Title group predicts over the title-queries run instead of raw UQV.
        _title_res_data = dp.ResultsReader(self.title_res_file, 'trec')
        self.prediction_queries_res_data = _title_res_data
    else:
        self.prediction_queries_res_data = _raw_res_data
    self.queries_data = dp.QueriesTextParser(self.queries_full_file, 'uqv')
    self.topics_data = dp.QueriesTextParser(self.queries_topic_file)
    # Uncomment the next lines if you want to write the basic results of the topic queries.
    # write_basic_results(self.prediction_queries_res_data.data_df.loc[self.topics_data.queries_df['qid']], corpus,
    #                     queries_group)
    # exit()
    # These 2 DF used for the filtering method
    self.variations_data = dp.QueriesTextParser(
        self.queries_variations_file, 'uqv')
    self.quantile_variations_data = dp.QueriesTextParser(
        self.queries_quantile_vars, 'uqv')
    # _var_scores_df.loc[_var_scores_df['qid'].isin(_vars_list)]
    self.raw_res_data = _raw_res_data
    self.fused_data = dp.ResultsReader(self.fused_results_file, 'trec')
    # Mapping of topic -> its query variants, taken from the full UQV file.
    self.query_vars = self.queries_data.query_vars
def queries_to_predict(full_queries_file, predict_queries):
    """Return the qids from the full UQV file whose query text also appears
    in the prediction-queries file.

    Matching is done by query text; the returned Series may be empty.
    """
    full_df = dp.QueriesTextParser(full_queries_file, 'uqv').queries_df
    predict_df = dp.QueriesTextParser(predict_queries).queries_df
    text_match = full_df['text'].isin(predict_df['text'])
    return full_df.loc[text_match]['qid']
def main(args):
    """Compute geometric-mean RM-probability predictions and write them.

    Reads the full UQV queries file and the per-query RM probability files
    for ``args.corpus``, then writes the geo-mean predictions. Returns
    early (None) when no corpus was supplied.
    """
    corpus = args.corpus
    if not corpus:
        return
    queries_file = dp.ensure_file(
        f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_full.txt')
    rm_probabilities_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/uqvPredictions/raw/RMprob')
    queries_obj = dp.QueriesTextParser(queries_file)
    rm_probabilities_df = dp.read_rm_prob_files(
        rm_probabilities_dir, number_of_docs=20000, clipping='*')
    # A file name containing 'uqv' marks a query-variations run; the flag
    # selects the output location in write_predictions.
    uqv = 'uqv' in queries_file.split('/')[-1].lower()
    results_df = geo_mean(queries_obj, rm_probabilities_df)
    write_predictions(results_df, corpus, uqv)
def __init__(self, corpus, predictor, similarity_features_df: pd.DataFrame, test_queries='top'):
    """Store the run configuration and load the test-queries file.

    :param corpus: corpus identifier used by path construction.
    :param predictor: name of the QPP predictor being evaluated.
    :param similarity_features_df: precomputed similarity features.
    :param test_queries: which query subset to test on (default 'top').
    """
    self.corpus = corpus
    self.predictor = predictor
    self.similarity_features_df = similarity_features_df
    self.test_queries = test_queries
    # Paths must be resolved before the test-queries file can be parsed.
    self.__set_paths()
    self.test_queries_obj = dp.QueriesTextParser(self.test_queries_file, kind='uqv')
def __init__(self, predictor, corpus, qgroup, vars_quantile, **kwargs):
    """Load cross-validation results, features and AP scores for a predictor.

    :param predictor: QPP predictor whose prediction files are loaded.
    :param corpus: corpus identifier used to resolve file paths.
    :param qgroup: query group; 'title' loads a separate base-results CV,
        otherwise base results are derived from the variant results.
    :param vars_quantile: quantile label used when resolving variant paths.
    :param kwargs: 'graphs' switches to graph-specific paths and then
        requires 'n' (number of variants).
    """
    graphs = kwargs.get('graphs', None)
    if graphs:
        n = kwargs.get('n', None)
        assert n, 'Missing number of vars'
        self.__set_graph_paths(corpus, predictor, qgroup, graphs, n)
    else:
        self.__set_paths(corpus, predictor, qgroup, vars_quantile)
    _q2p_obj = dp.QueriesTextParser(self.queries2predict_file, 'uqv')
    # Cross-validation over the per-variant prediction files.
    self.var_cv = InterTopicCrossValidation(
        folds_map_file=self.folds, predictions_dir=self.vars_results_dir)
    _vars_results_df = self.var_cv.full_set
    # Initialize the base prediction results of the queries to be predicted
    if qgroup == 'title':
        _base_cv = InterTopicCrossValidation(
            folds_map_file=self.folds, predictions_dir=self.base_results_dir)
        self.base_results_df = _base_cv.full_set
    else:
        # Non-title groups: slice the variant results down to the queries
        # being predicted and re-key them from variant ids to query ids.
        self.base_results_df = dp.convert_vid_to_qid(
            _vars_results_df.loc[_q2p_obj.queries_dict.keys()])
    self.base_results_df.rename_axis('topic', inplace=True)
    # The next function is used to save results in basic predictions format of the given queries set
    # write_basic_predictions(self.base_results_df, corpus, qgroup, predictor)
    self.query_vars = dp.QueriesTextParser(self.query_vars_file, 'uqv')
    _quantile_vars = dp.QueriesTextParser(self.quantile_vars_file, 'uqv')
    _features_df = features_loader(self.features, corpus)
    self.features_df = self.__initialize_features_df(
        _quantile_vars, _features_df)
    # Per-variant predictor scores, geo-mean scores and real AP, all aligned
    # on the (topic, qid) pairs present in the features frame.
    self.var_scores_df = self.__initialize_var_scores_df(
        _features_df.reset_index()[['topic', 'qid']], _vars_results_df)
    self.geo_mean_df = self.__initialize_geo_scores_df(
        _features_df.reset_index()[['topic', 'qid']],
        dp.ResultsReader(self.geo_mean_file, 'predictions').data_df)
    self.real_ap_df = self.__initialize_var_scores_df(
        _features_df.reset_index()[['topic', 'qid']],
        dp.ResultsReader(self.real_ap_file, 'ap').data_df)
    self.geo_as_predictor()
def __init__(self, corpus):
    """Resolve paths and load queries, features and QL results for *corpus*.

    :param corpus: corpus identifier used by ``__set_paths`` to locate the
        queries text file and the QL results file.
    """
    self.corpus = corpus
    # Paths must be resolved before any file below can be read.
    self.__set_paths()
    self.queries_obj = dp.QueriesTextParser(self.queries_txt_file, kind='uqv')
    # Re-index the queries by qid, with a 'topic' column attached.
    self.queries_obj.queries_df = dp.add_topic_to_qdf(
        self.queries_obj.queries_df).set_index('qid')
    self.features_df = self.initialize_features_df()
    self.ql_results_obj = dp.ResultsReader(self.ql_results_file, 'trec')
def __init__(self, corpus, max_n=20, corr_measure='pearson', load_from_pkl=True, queries_group='title'):
    """Load queries and raw AP results, then cap the variants-per-topic count.

    :param corpus: corpus identifier used for path resolution.
    :param max_n: upper bound on variants per topic; the effective bound is
        the smaller of this and the largest variant count in the data.
    :param corr_measure: correlation measure name (default 'pearson').
    :param load_from_pkl: whether cached pickles may be loaded.
    :param queries_group: query group used when resolving paths.
    """
    self.group = queries_group
    self.corr_measure = corr_measure
    self.load_from_pkl = load_from_pkl
    self.corpus = corpus
    self.__set_paths(corpus, queries_group)
    self.queries_obj = dp.QueriesTextParser(self.queries_file)
    self.queries_obj.queries_df = add_topic_to_qdf(self.queries_obj.queries_df)
    self.raw_ap_obj = dp.ResultsReader(self.raw_ap_file, 'ap')
    # Largest number of variants any single topic has in the queries file.
    variants_per_topic = self.queries_obj.queries_df.groupby('topic').count()
    self.max_n = min(variants_per_topic.max()['qid'], max_n)
    self.basic_results_dict = defaultdict(float)
    self.__initialize_basic_results_dict()
def add_original_queries(uqv_obj: dt.QueriesTextParser):
    """Don't use this function ! not tested

    Labels every UQV variant with its topic, scans for topics whose original
    (title) query text is absent from the variants, then appends a
    hard-coded row for topic '341' and returns the combined frame (sorted by
    qid, topic column dropped).

    NOTE(review): ``missing_list`` is computed but never used below — the
    appended row is hard-coded for topic '341' regardless of what was found
    missing. Verify this is intended before relying on this function.
    """
    original_obj = dt.QueriesTextParser('QppUqvProj/data/ROBUST/queries.txt')
    uqv_df = uqv_obj.queries_df.set_index('qid')
    original_df = original_obj.queries_df.set_index('qid')
    # Tag each variant row with its topic id.
    for topic, variants in uqv_obj.query_vars.items():
        uqv_df.loc[variants, 'topic'] = topic
    missing_list = []
    for topic, topic_df in uqv_df.groupby('topic'):
        # A topic is "missing" when none of its variants match the original text.
        if original_df.loc[original_df['text'].isin(topic_df['text'])].empty:
            missing_list.append(topic)
    missing_df = pd.DataFrame(
        {
            'qid': '341-9-1',
            'text': original_obj.queries_dict['341'],
            'topic': '341'
        },
        index=[0])
    # DataFrame.append was removed in pandas 2.0 — pd.concat is the
    # supported equivalent and produces the same result here.
    uqv_df = pd.concat([uqv_df, missing_df.set_index('qid')])
    return uqv_df.sort_index().drop(columns='topic').reset_index()
def main(args):
    """CLI entry point for filtering, analysing and plotting UQV queries.

    Depending on the arguments it: removes duplicate variants, subtracts a
    second queries file, filters variants by AP group or quantile, prints
    statistics, plots variant AP, or prints the top AP differences.

    :param args: parsed argparse namespace with attributes ``queries``,
        ``remove``, ``ap``, ``group``, ``quant``, ``stats``, ``plot_vars``.
    """
    queries_txt_file = args.queries
    queries_to_remove = args.remove
    ap_file = args.ap
    queries_group = args.group
    quant_variants = args.quant
    stats = args.stats
    plot_vars = args.plot_vars
    # Dispatch table: AP-group name -> its filtering function.
    filter_functions_dict = {
        'top': filter_top_queries,
        'low': filter_low_queries,
        'medl': filter_medl_queries,
        'medh': filter_medh_queries
    }
    # quantiles_dict = {'low': [0, 0.33], 'med': [0.33, 0.66], 'top': [0.66, 1]}
    quantiles_dict = {'low': [0, 0.5], 'high': [0.5, 1]}
    # Corpus is inferred from the queries file path.
    corpus = 'ROBUST' if 'ROBUST' in queries_txt_file else 'ClueWeb12B'
    if queries_txt_file:
        qdb = dt.QueriesTextParser(queries_txt_file, 'uqv')
        # NOTE(review): df is assigned but never used afterwards — TODO confirm
        # add_topic_to_qdf has no needed side effect before removing it.
        df = add_topic_to_qdf(qdb.queries_df)
        qdb.queries_df = remove_duplicates(qdb)
        if queries_to_remove:
            # Subtract the queries of a second file (matched by text).
            qdb_rm = dt.QueriesTextParser(queries_to_remove)
            qdb.queries_df = remove_q1_from_q2(qdb_rm.queries_df, qdb)
        if ap_file:
            apdb = dt.ResultsReader(ap_file, 'ap')
            if queries_group != 'title':
                # Keep only the variants of the requested AP group.
                qdb.queries_df = filter_functions_dict[queries_group](
                    qdb.queries_df, apdb)
            elif quant_variants:
                # Keep only the variants inside the requested AP quantile.
                qdb.queries_df = filter_quant_variants(
                    qdb.queries_df, apdb, quantiles_dict[quant_variants])
            if stats:
                title_queries_file = dt.ensure_file(
                    f'~/QppUqvProj/data/{corpus}/queries_{corpus}_title.txt')
                title_queries_df = dt.QueriesTextParser(
                    title_queries_file).queries_df
                title_ap_file = dt.ensure_file(
                    f'~/QppUqvProj/Results/{corpus}/test/basic/QLmap1000')
                title_ap = dt.ResultsReader(title_ap_file, 'ap')
                calc_statistics(qdb.queries_df, apdb, title_queries_df,
                                title_ap, filter_functions_dict,
                                quantiles_dict, corpus)
                return
            elif plot_vars:
                title_queries_file = dt.ensure_file(
                    f'~/QppUqvProj/data/{corpus}/queries_{corpus}_title.txt')
                title_queries_df = dt.QueriesTextParser(
                    title_queries_file).queries_df
                title_ap_file = dt.ensure_file(
                    f'~/QppUqvProj/Results/{corpus}/test/basic/QLmap1000')
                title_ap = dt.ResultsReader(title_ap_file, 'ap')
                plot_variants_ap(qdb.queries_df, apdb, title_queries_df,
                                 title_ap, corpus)
                return
            print_top_differences(qdb.queries_df, apdb, corpus)