def load_full_features_df(**kwargs):
    """Load all the reference-list feature values into one DataFrame.

    :param kwargs: either `corpus`, `queries_group` and `quantile` (used to
        build a new RefQueryFeatureFactory), or `features_factory_obj`: an
        already-constructed QueryFeatureFactory() object whose `corpus` and
        `queries_group` attributes are reused.
    :return: pd.DataFrame that contains all the features values, normalized
        by features_obj.divide_by_size.
    """
    corpus = kwargs.get('corpus', None)
    queries_group = kwargs.get('queries_group', None)
    quantile = kwargs.get('quantile', None)
    features_factory_obj = kwargs.get('features_factory_obj', None)
    if features_factory_obj:
        # An existing factory overrides any corpus/group passed alongside it.
        features_obj = features_factory_obj
        corpus = features_obj.corpus
        queries_group = features_obj.queries_group
    else:
        # NOTE(review): assert is stripped under `python -O`; validation relies
        # on the script being run without optimization.
        assert corpus and queries_group and quantile, f"Can't create a factory object from Corpus={corpus}, " \
                                                      f"Queries group={queries_group}, Variations Quantile={quantile}"
        features_obj = RefQueryFeatureFactory(corpus, queries_group, quantile)
    pkl_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}/test/ref/pkl_files/')
    _list = []
    last_df = pd.DataFrame()
    for n in NUMBER_OF_DOCS:
        _file = f'{pkl_dir}/{queries_group}_queries_{corpus}_RBO_{n}_TopDocs_{n}.pkl'
        try:
            # dp.ensure_file raises AssertionError when the pkl is missing.
            dp.ensure_file(_file)
            _df = pd.read_pickle(_file).set_index(['topic', 'qid'])
            # Normalize the overlap count into [0, 1] by the number of docs.
            _df[f'Top_{n}_Docs_overlap'] = _df[f'Top_{n}_Docs_overlap'] / n
            # 'Jac_coefficient' is identical across the per-n files; keep it
            # out of the per-n frames and append it once at the end (last_df
            # holds the column from the last successfully loaded file).
            _list.append(_df.drop('Jac_coefficient', axis=1))
            last_df = _df['Jac_coefficient']
        except AssertionError:
            print(f'!! Warning !! The file {_file} is missing')
    df = pd.concat(_list + [last_df], axis=1)
    return features_obj.divide_by_size(df)
def check_significance(corpus, predictor, alpha=0.05):
    """Run a significance test between the baseline (avg) and LTR predictions.

    Loads the per-repetition correlation vectors of the baseline (aggregated
    avg) and candidate (ltr) evaluations for `predictor`, prints the baseline
    mean, and returns the result of `t_test` at significance level `alpha`.

    :param corpus: corpus name used in the results directory layout
    :param predictor: predictor name used in the results directory layout
    :param alpha: significance level forwarded to t_test
    :return: whatever `t_test` returns (defined elsewhere in the project)
    """

    def _load_correlation_df(evaluation_dir, suffix):
        # Both result vectors share the same file-name pattern; only the
        # suffix ('avg' vs 'ltr') differs.  Extracted to remove duplication.
        results_file = dp.ensure_file(
            f'{evaluation_dir}/simple_results_vector_for_2_folds_30_repetitions_{suffix}.json'
        )
        with open(results_file) as json_data:
            data = json.load(json_data)
        return pd.DataFrame.from_dict(data, orient='index',
                                      columns=['correlation'], dtype=float)

    _base_dir = f'~/QppUqvProj/Results/{corpus}/uqvPredictions/aggregated/avg/'
    baseline_dir = dp.ensure_dir(f'{_base_dir}/{predictor}/evaluation/')
    baseline_sr = _load_correlation_df(baseline_dir, 'avg')
    candidate_dir = dp.ensure_dir(f'{_base_dir}/{predictor}/ltr/evaluation/')
    candidate_sr = _load_correlation_df(candidate_dir, 'ltr')
    # .iloc[0] instead of [0]: positional integer indexing on a labeled
    # Series is deprecated in modern pandas.
    print(f'baseline: {baseline_sr.mean().iloc[0]:.3f}')
    return t_test(baseline_sr, candidate_sr, alpha)
def __set_paths(self):
    """Resolve and validate the default project paths for this instance.

    Assumes the standard naming convention of the project; stores the
    resolved locations as instance attributes.
    """
    results_root = dp.ensure_dir(f'~/QppUqvProj/Results/{self.corpus}')
    data_root = dp.ensure_dir(f'~/QppUqvProj/data/{self.corpus}')
    # Raw QL retrieval results for the corpus.
    self.ql_results_file = dp.ensure_file(f'{results_root}/test/raw/QL.res')
    # Stemmed UQV query variations file.
    self.queries_txt_file = dp.ensure_file(
        f'{data_root}/queries_{self.corpus}_UQV_full.stemmed.txt')
    # self.predictions_dir = dp.ensure_dir(f'{_corpus_res_dir}/uqvPredictions/raw/{self.predictor}')
    # Directory holding the cached pickle files.
    self.pkl_dir = dp.ensure_dir(f'{results_root}/test/raw/pkl_files/')
def __set_paths(cls, corpus, predictor, qgroup, vars_quantile):
    """This method sets the default paths of the files and the working directories,
    it assumes the standard naming convention of the project.

    :param corpus: corpus name used throughout the directory layout
    :param predictor: predictor name; also cached on the class
    :param qgroup: queries group ('title' or a variations group)
    :param vars_quantile: variations quantile name ('all' or a quantile label)
    """
    cls.predictor = predictor
    _base_dir = f'~/QppUqvProj/Results/{corpus}/uqvPredictions/'
    # Prediction results of all UQV query variants.
    cls.vars_results_dir = dp.ensure_dir(
        f'{_base_dir}/raw/{predictor}/predictions/')
    if qgroup == 'title':
        # NOTE(review): base_results_dir is only assigned for the 'title'
        # group here — confirm other groups set it elsewhere before use.
        _orig_dir = dp.ensure_dir(
            f'~/QppUqvProj/Results/{corpus}/basicPredictions/title')
        cls.base_results_dir = f'{_orig_dir}/{predictor}/predictions/'
    # Output directory for the generated reference-list predictions.
    cls.output_dir = dp.ensure_dir(
        f'{_base_dir}/referenceLists/{qgroup}/{vars_quantile}_vars/')
    # Files used for the LTR training and the cross-validation process.
    _test_dir = f'~/QppUqvProj/Results/{corpus}/test'
    cls.folds = dp.ensure_file(f'{_test_dir}/2_folds_30_repetitions.json')
    cls.ap_file = dp.ensure_file(f'{_test_dir}/ref/QLmap1000-{qgroup}')
    # cls.features = '{}/raw/query_features_{}_uqv_legal.JSON'.format(_test_dir, corpus)
    # cls.features = f'{_test_dir}/ref/{qgroup}_query_features_{corpus}_uqv.JSON'
    cls.features = dp.ensure_file(
        f'{_test_dir}/ref/{qgroup}_query_{vars_quantile}_variations_features_{corpus}_uqv.JSON'
    )
    cls.geo_mean_file = dp.ensure_file(
        f'{_base_dir}/raw/geo/predictions/predictions-20000')
    # The variations file is used in the filter function - it consists of all the vars w/o the query at hand
    _query_vars = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_wo_{qgroup}.txt'
    cls.query_vars_file = os.path.normpath(os.path.expanduser(_query_vars))
    dp.ensure_file(cls.query_vars_file)
    _queries2predict = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_{qgroup}.txt'
    cls.queries2predict_file = dp.ensure_file(_queries2predict)
    if vars_quantile == 'all':
        # 'all' means no quantile filtering: reuse the full variations file.
        cls.quantile_vars_file = cls.query_vars_file
    else:
        _quantile_vars = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_{vars_quantile}_variants.txt'
        cls.quantile_vars_file = os.path.normpath(
            os.path.expanduser(_quantile_vars))
        dp.ensure_file(cls.quantile_vars_file)
    cls.real_ap_file = dp.ensure_file(
        f'~/QppUqvProj/Results/{corpus}/test/raw/QLmap1000')
    cls.geo_predictions_dir = dp.ensure_dir(
        f'{_base_dir}/referenceLists/{qgroup}/{vars_quantile}_vars/sim_as_pred/geo/predictions'
    )
def features_loader(file_to_load, corpus):
    """Read a features JSON file into a (topic, qid)-indexed DataFrame.

    Falls back to the default per-corpus features file when `file_to_load`
    is None.  Topic labels are truncated at the first '-' and the frame is
    sorted by its index.
    """
    default_path = f'features_{corpus}_uqv.JSON'
    features_path = dp.ensure_file(
        default_path if file_to_load is None else file_to_load)
    df = pd.read_json(features_path, dtype={'topic': str, 'qid': str})
    df = df.reset_index(drop=True).set_index(['topic', 'qid'])
    # Keep only the base topic id, e.g. '301-2' -> '301'.
    df = df.rename(index=lambda topic: topic.split('-')[0], level=0)
    return df.sort_values(['topic', 'qid'], axis=0)
def generate_results_df(self, cores=None, load_from_pkl=None):
    """Load the full results DataFrame from its pkl cache, or compute it.

    :param cores: number of worker processes for the multiprocessing pool
        (None lets multiprocessing pick os.cpu_count()).
    :param load_from_pkl: when truthy, try the pkl cache first and only
        compute on a cache miss; otherwise always recompute and overwrite.
    :return: pd.DataFrame with the concatenated per-combination results.
    """
    # _pkl_file = f'{self.data_dir}/pkl_files/full_results_df_{self.max_n}_{self.corpus}_{self.corr_measure}_{self.group}.pkl'
    _pkl_file = f'{self.data_dir}/pkl_files/full_results_df_{self.max_n}_{self.corpus}_{self.corr_measure}.pkl'

    def _generate_and_save():
        # Fan out over every (order, predictor, similarity) combination.
        # This body was duplicated verbatim in the except and else branches;
        # extracted so the generation logic exists once.
        with mp.Pool(processes=cores) as pool:
            result = pool.starmap(
                self._calc_general_model_result,
                itertools.product({'asce', 'desc'}, PREDICTORS,
                                  SIMILARITY_FUNCTIONS.values()))
            pool.close()
        _df = pd.concat(result, axis=0)
        _df.to_pickle(_pkl_file)
        return _df

    if load_from_pkl:
        try:
            file_to_load = dp.ensure_file(_pkl_file)
            full_results_df = pd.read_pickle(file_to_load)
        except AssertionError:
            print(f'\nFailed to load {_pkl_file}')
            print(f'Will generate {_pkl_file} and save')
            full_results_df = _generate_and_save()
    else:
        full_results_df = _generate_and_save()
    return full_results_df
def main(args):
    """Compute geometric-mean RM predictions for a corpus and write them out.

    Reads the UQV queries file and the RM probability files for
    `args.corpus`, computes the geo-mean predictions and writes them.
    Returns silently when no corpus was given.
    """
    corpus = args.corpus
    # corpus = 'ROBUST'
    if not corpus:
        return
    queries_file = dp.ensure_file(
        f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_full.txt')
    rm_probabilities_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/uqvPredictions/raw/RMprob')
    # queries_file = dp.ensure_file(f'~/QppUqvProj/data/{corpus}/queries.txt')
    # rm_probabilities_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}/basicPredictions/title/RMprob')
    queries_obj = dp.QueriesTextParser(queries_file)
    rm_probabilities_df = dp.read_rm_prob_files(rm_probabilities_dir,
                                                number_of_docs=20000,
                                                clipping='*')
    # Idiom fix: `True if cond else False` is redundant — the membership
    # test is already a bool.  uqv marks whether the queries file is a UQV
    # (variations) file, judged by its basename.
    uqv = 'uqv' in queries_file.split('/')[-1].lower()
    results_df = geo_mean(queries_obj, rm_probabilities_df)
    write_predictions(results_df, corpus, uqv)
def __init__(self, folds_map_file=None, k=2, rep=30, predictions_dir=None,
             test='pearson', ap_file=None, generate_folds=False, **kwargs):
    """Set up a k-fold / rep-repetition cross-validation run.

    :param folds_map_file: path to the folds mapping JSON file (required)
    :param k: number of folds
    :param rep: number of repetitions
    :param predictions_dir: directory with prediction files (required)
    :param test: correlation measure name (e.g. 'pearson')
    :param ap_file: optional AP file; its '-' suffix selects the AP function
    :param generate_folds: when True, always generate a fresh folds file
    """
    logging.debug("testing logger")
    self.k = k
    self.rep = rep
    self.test = test
    assert predictions_dir, 'Specify predictions dir'
    assert folds_map_file, 'Specify path for CV folds file'
    predictions_dir = os.path.abspath(
        os.path.normpath(os.path.expanduser(predictions_dir)))
    assert os.listdir(predictions_dir), f'{predictions_dir} is empty'
    # Evaluation output mirrors the predictions directory layout.
    self.output_dir = dp.ensure_dir(
        predictions_dir.replace('predictions', 'evaluation'))
    if ap_file:
        self.full_set = self._build_full_set(predictions_dir, ap_file)
        # AP-file naming convention: the suffix after '-' names the AP
        # function variant; plain files use the 'basic' function.
        if '-' in ap_file:
            self.ap_func = ap_file.split('-')[-1]
        else:
            self.ap_func = 'basic'
    else:
        self.full_set = self._build_full_set(predictions_dir)
    if generate_folds:
        self.index = self.full_set.index
        self.folds_file = self._generate_k_folds()
        self.__load_k_folds()
    else:
        try:
            self.folds_file = dp.ensure_file(folds_map_file)
        except (AssertionError, FileExistsError):
            # BUG FIX: dp.ensure_file signals a missing file with
            # AssertionError (as every other call site in this file catches),
            # so the original `except FileExistsError` never fired and the
            # fallback promised by the message never ran.  Now actually
            # generate the folds file, mirroring the generate_folds branch.
            print("The folds file specified doesn't exist, going to generate the file and save")
            self.index = self.full_set.index
            self.folds_file = self._generate_k_folds()
        self.__load_k_folds()
def __set_paths(cls, corpus):
    """Cache the standard project paths for *corpus* as class attributes.

    Assumes the standard naming convention of the project.
    """
    test_root = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}/test/')
    graphs_root = f'~/QppUqvProj/Graphs/{corpus}'
    # AP file for the cross validation process
    cls.query_ap_file = dp.ensure_file(f'{test_root}/ref/QLmap1000-title')
    # CV folds mapping file
    cls.cv_map_file = dp.ensure_file(f'{test_root}/2_folds_30_repetitions.json')
    # True AP values of the title queries
    cls.true_ap_file = dp.ensure_file(f'{test_root}/basic/QLmap1000')
    # The data dir for the Graphs
    cls.data_dir = dp.ensure_dir(f'{graphs_root}/data')
    # The results base dir for the Graphs
    cls.results_dir = dp.ensure_dir(
        f'{graphs_root}/referenceLists/title/all_vars/general')
    # Raw results base dir of the reference-list predictions
    cls.raw_res_base_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/uqvPredictions/referenceLists/title/all_vars/general')
def load_per_topic_df(self):
    """Load the cached per-topic correlations DataFrame.

    :return: the unpickled pd.DataFrame, or None (with a warning logged)
        when the pkl file does not exist.
    """
    # Single source of truth for the path: it was previously duplicated
    # verbatim in the ensure_file call and in the warning message.
    _pkl_path = (f'{self.output_dir}/per_topic_correlations_for_'
                 f'{self.k}_folds_{self.rep}_repetitions_pageRank.pkl')
    try:
        inter_res_file = dp.ensure_file(_pkl_path)
    except AssertionError:
        logging.warning(f"File {_pkl_path} doesnt exist")
        return None
    return pd.read_pickle(inter_res_file)
def __set_graph_paths(cls, corpus, predictor, qgroup, direct, n):
    """This method sets the default paths of the files and the working directories,
    it assumes the standard naming convention of the project.

    :param corpus: corpus name used throughout the directory layout
    :param predictor: predictor name; also cached on the class
    :param qgroup: queries group name
    :param direct: graph direction component of the Graphs data layout
    :param n: number of query variations
    """
    cls.predictor = predictor
    _corpus_res_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/uqvPredictions/')
    _corpus_dat_dir = dp.ensure_dir(f'~/QppUqvProj/data/{corpus}')
    _graphs_base_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}')
    _graphs_dat_dir = dp.ensure_dir(f'{_graphs_base_dir}/data/{direct}')
    # Prediction results of all UQV query variants
    cls.vars_results_dir = dp.ensure_dir(
        f'{_corpus_res_dir}/raw/{predictor}/predictions/')
    # Prediction results of the queries to be predicted
    _orig_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/basicPredictions/{qgroup}')
    cls.base_results_dir = f'{_orig_dir}/{predictor}/predictions/'
    # The directory to save the new results
    cls.output_dir = dp.ensure_dir(
        f'{_graphs_base_dir}/referenceLists/{qgroup}/{direct}/{n}_vars')
    # The files for used for the LTR and CV
    _test_dir = f'~/QppUqvProj/Results/{corpus}/test'
    cls.folds = dp.ensure_file(f'{_test_dir}/2_folds_30_repetitions.json')
    cls.ap_file = dp.ensure_file(f'{_test_dir}/ref/QLmap1000-{qgroup}')
    # The features file used for prediction
    cls.features = dp.ensure_file(
        f'{_graphs_dat_dir}/features/{qgroup}_query_{n}_variations_features_{corpus}_uqv.JSON'
    )
    # BUG FIX: the path was missing the leading '~/' prefix that every other
    # absolute path in this project uses (cf. the sibling __set_paths), which
    # made it resolve relative to the current working directory.
    cls.geo_mean_file = dp.ensure_file(
        f'~/QppUqvProj/Results/{corpus}/uqvPredictions/raw/geo/predictions/predictions-20000'
    )
    # The variations file is used in the filter function - it consists of all the vars w/o the query at hand
    cls.query_vars_file = dp.ensure_file(
        f'{_graphs_dat_dir}/queries/queries_wo_{qgroup}_{n}_vars.txt')
    cls.quantile_vars_file = cls.query_vars_file
    _queries2predict = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_{qgroup}.txt'
    cls.queries2predict_file = dp.ensure_file(_queries2predict)
    cls.real_ap_file = dp.ensure_file(
        f'~/QppUqvProj/Results/{corpus}/test/raw/QLmap1000')
    cls.geo_predictions_dir = dp.ensure_dir(
        f'{_corpus_res_dir}/referenceLists/{qgroup}/all_vars/sim_as_pred/geo/predictions'
    )
def __set_paths(cls, corpus, group):
    """Cache the standard project paths for *corpus*/*group* as class attributes."""
    test_root = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}/test/')
    graphs_root = f'~/QppUqvProj/Graphs/{corpus}'
    # Basic predictions dir
    cls.basic_predictions_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/basicPredictions/{group}/')
    # AP file to pick variations according to AP
    cls.raw_ap_file = dp.ensure_file(f'{test_root}/raw/QLmap1000')
    # AP file for the cross validation process
    cls.query_ap_file = dp.ensure_file(f'{test_root}/ref/QLmap1000-{group}')
    # CV folds mapping file
    cls.cv_map_file = dp.ensure_file(
        f'{test_root}/2_folds_30_repetitions.json')
    # Queries file with all the variations except the ones to be predicted
    cls.queries_file = dp.ensure_file(
        f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_wo_{group}.txt')
    # The data dir for the Graphs
    cls.data_dir = dp.ensure_dir(f'{graphs_root}/data')
    # The results base dir for the Graphs
    cls.results_dir = dp.ensure_dir(
        f'{graphs_root}/referenceLists/{group}')
def __set_paths(cls, corpus, qgroup, vars_quantile): """This method sets the default paths of the files and the working directories, it assumes the standard naming convention of the project""" # cls.predictor = predictor _corpus_res_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}') _corpus_dat_dir = dp.ensure_dir(f'~/QppUqvProj/data/{corpus}') _results_file = f'{_corpus_res_dir}/test/raw/QL.res' cls.results_file = os.path.normpath(_results_file) dp.ensure_file(cls.results_file) _title_results_file = f'{_corpus_res_dir}/test/basic/QL.res' cls.title_res_file = os.path.normpath(_title_results_file) dp.ensure_file(cls.title_res_file) cls.queries_full_file = dp.ensure_file( f'{_corpus_dat_dir}/queries_{corpus}_UQV_full.stemmed.txt') # The variations file is used in the filter function - it consists of all the vars w/o the query at hand _queries_variations_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_wo_{qgroup}.txt' cls.queries_variations_file = dp.ensure_file(_queries_variations_file) # The vars quantile file is used in the filter function - it consists of the relevant vars quantile if vars_quantile == 'all': _queries_quantile_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_full.txt' else: _queries_quantile_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_{vars_quantile}_variants.txt' cls.queries_quantile_vars = dp.ensure_file(_queries_quantile_file) _queries_topic_file = f'{_corpus_dat_dir}/queries_{corpus}_{qgroup}.stemmed.txt' cls.queries_topic_file = dp.ensure_file(_queries_topic_file) _fused_results_file = f'{_corpus_res_dir}/test/fusion/QL.res' cls.fused_results_file = dp.ensure_file(_fused_results_file) # cls.output_dir = dp.ensure_dir(f'{_corpus_res_dir}/test/raw/') _predictions_out = f'{_corpus_res_dir}/uqvPredictions/referenceLists/{qgroup}/{vars_quantile}_vars/sim_as_pred/' cls.predictions_output_dir = dp.ensure_dir(_predictions_out) cls.pkl_dir = dp.ensure_dir(f'{_corpus_res_dir}/test/ref/pkl_files/')
def load_similarity_features_df(self):
    """
    Try loading the features df from a file, if fails will generate a new one
    :return: pandas DF with the similarity features
    """
    pkl_path = f'{self.pkl_dir}/similarity_features_df.pkl'
    try:
        return pd.read_pickle(dp.ensure_file(pkl_path))
    except AssertionError:
        print(
            f'-- Failed loading {pkl_path}, will generate and save --'
        )
        sim_df = self.calc_features_parallel()
        # Cache the freshly computed features for next time.
        sim_df.to_pickle(pkl_path)
        return sim_df
def __load_features_df(self, _file_name):
    """Load the features DF from a pkl file; on a cache miss generate a new
    df, save it, then normalize the top-docs overlap column."""
    try:
        # dp.ensure_file raises AssertionError for a missing file.
        features_df = pd.read_pickle(dp.ensure_file(_file_name))
    except AssertionError:
        print(f'\nFailed to load {_file_name}')
        print(f'Will generate {self.pkl_dir.rsplit("/")[-1]} vars {self.queries_group}_query_features '
              f'features and save')
        features_df = self._calc_features()
        features_df.to_pickle(_file_name)
    # Normalize the overlap count into [0, 1] by the number of top docs.
    top_n = self.top_docs_overlap
    overlap_col = f'Top_{top_n}_Docs_overlap'
    features_df[overlap_col] = features_df[overlap_col] / top_n
    return features_df
def __set_graph_paths(cls, corpus, qgroup, direct, n): """This method sets the default paths of the files and the working directories, it assumes the standard naming convention of the project""" # cls.predictor = predictor _corpus_res_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}') _corpus_dat_dir = dp.ensure_dir(f'~/QppUqvProj/data/{corpus}') _graphs_base_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}') _graphs_res_dir = dp.ensure_dir( f'{_graphs_base_dir}/referenceLists/{qgroup}/{direct}/{n}_vars') _graphs_dat_dir = dp.ensure_dir(f'{_graphs_base_dir}/data') cls.number_of_vars = n _results_file = f'{_corpus_res_dir}/test/raw/QL.res' cls.results_file = os.path.normpath(_results_file) dp.ensure_file(cls.results_file) _title_results_file = f'{_corpus_res_dir}/test/basic/QL.res' cls.title_res_file = os.path.normpath(_title_results_file) dp.ensure_file(cls.title_res_file) _queries_full_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_full.stemmed.txt' cls.queries_full_file = dp.ensure_file(_queries_full_file) # The variations file is used in the filter function - it consists of all the vars w/o the query at hand _queries_variations_file = f'{_graphs_dat_dir}/{direct}/queries/queries_wo_{qgroup}_{n}_vars.txt' cls.queries_variations_file = dp.ensure_file(_queries_variations_file) cls.queries_quantile_vars = cls.queries_variations_file _queries_topic_file = f'{_corpus_dat_dir}/queries_{corpus}_{qgroup}.stemmed.txt' cls.queries_topic_file = dp.ensure_file(_queries_topic_file) _fused_results_file = f'{_corpus_res_dir}/test/fusion/QL.res' # _fused_results_file = f'{_corpus_res_dir}/test/fusion/all_wo_{qgroup}_fused_QL.res' cls.fused_results_file = dp.ensure_file(_fused_results_file) # cls.output_dir = dp.ensure_dir(f'{_graphs_res_dir}/test/raw/') cls.predictions_output_dir = dp.ensure_dir( f'{_graphs_res_dir}/sim_as_pred/') cls.pkl_dir = dp.ensure_dir(f'{_graphs_dat_dir}/pkl_files/features')
def __initialize_basic_results_dict(self):
    """Populate self.basic_results_dict, from its pkl cache when possible.

    When self.load_from_pkl is truthy, try to unpickle the cached dict and
    only recompute on a cache miss; otherwise always recompute and overwrite
    the cache.
    """
    _pkl_file = f'{self.data_dir}/pkl_files/basic_results_dict_{self.corpus}_{self.corr_measure}.pkl'

    def _generate_and_save():
        # This body was duplicated verbatim in the except and else branches;
        # extracted so the computation + dump logic exists once.
        for predictor in PREDICTORS:
            self.calc_single_query_result(predictor)
        with open(_pkl_file, 'wb') as handle:
            pickle.dump(self.basic_results_dict, handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

    if self.load_from_pkl:
        try:
            file_to_load = dp.ensure_file(_pkl_file)
            with open(file_to_load, 'rb') as handle:
                self.basic_results_dict = pickle.load(handle)
        except AssertionError:
            print(f'\nFailed to load {_pkl_file}')
            print(f'Will generate {_pkl_file} and save')
            _generate_and_save()
    else:
        _generate_and_save()
def generate_results_df(self, cores=4):
    """Load the lambda full results DataFrame from its pkl cache, or compute it.

    :param cores: number of worker processes for the multiprocessing pool.
    :return: pd.DataFrame with the concatenated per-combination results.
    """
    _pkl_file = f'{self.data_dir}/pkl_files/lambda_full_results_df_{self.corpus}_{self.corr_measure}.pkl'

    def _generate_and_save():
        # Fan out over every (similarity, predictor) combination.  This body
        # was duplicated verbatim in the except and else branches; extracted
        # so the generation logic exists once.
        with mp.Pool(processes=cores) as pool:
            result = pool.starmap(
                self.generate_graph_df,
                itertools.product(SIMILARITY_FUNCTIONS.values(), PREDICTORS))
            pool.close()
        _df = pd.concat(result, axis=0)
        _df.to_pickle(_pkl_file)
        return _df

    if self.load_from_pkl:
        try:
            file_to_load = dp.ensure_file(_pkl_file)
            full_results_df = pd.read_pickle(file_to_load)
        except AssertionError:
            print(f'\nFailed to load {_pkl_file}')
            print(f'Will generate {_pkl_file} and save')
            full_results_df = _generate_and_save()
    else:
        full_results_df = _generate_and_save()
    return full_results_df
def main(args):
    """Filter/analyze UQV query variations according to the CLI arguments.

    Depending on the flags: filters the variations by group or AP quantile,
    prints corpus statistics, plots per-variant AP, or prints the top
    per-topic differences.
    """
    queries_txt_file = args.queries
    queries_to_remove = args.remove
    ap_file = args.ap
    queries_group = args.group
    quant_variants = args.quant
    stats = args.stats
    plot_vars = args.plot_vars
    # Maps a queries-group name to the filter applied to the variations DF.
    filter_functions_dict = {
        'top': filter_top_queries,
        'low': filter_low_queries,
        'medl': filter_medl_queries,
        'medh': filter_medh_queries
    }
    # quantiles_dict = {'low': [0, 0.33], 'med': [0.33, 0.66], 'top': [0.66, 1]}
    quantiles_dict = {'low': [0, 0.5], 'high': [0.5, 1]}
    # # Uncomment for Debugging !!!!!
    # print('\n\n\n----------!!!!!!!!!!!!--------- Debugging Mode ----------!!!!!!!!!!!!---------\n\n\n')
    # # quant_variants = 'low'
    # corpus = 'ClueWeb12B'
    # corpus = 'ROBUST'
    # ap_file = dt.ensure_file(f'~/QppUqvProj/Results/{corpus}/test/raw/QLmap1000')
    # queries_txt_file = dt.ensure_file(f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_full.txt')
    # queries_txt_file_wo_title = dt.ensure_file(f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_wo_title.txt')
    # queries_txt_file_wo_top = dt.ensure_file(f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_wo_top.txt')
    # queries_txt_file_wo_low = dt.ensure_file(f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_wo_low.txt')
    # queries_txt_file_wo_med = dt.ensure_file(f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_wo_medh.txt')
    # plot_vars = True
    # df = create_overlap_ref_queries(queries_txt_file_wo_top, queries_txt_file_wo_low, queries_txt_file_wo_med,
    #                                 queries_txt_file_wo_title)
    # write_queries_to_files(df, corpus, 'cref')
    # exit()
    # The corpus is inferred from the queries file name.
    corpus = 'ROBUST' if 'ROBUST' in queries_txt_file else 'ClueWeb12B'
    if queries_txt_file:
        qdb = dt.QueriesTextParser(queries_txt_file, 'uqv')
        df = add_topic_to_qdf(qdb.queries_df)
        qdb.queries_df = remove_duplicates(qdb)
        if queries_to_remove:
            qdb_rm = dt.QueriesTextParser(queries_to_remove)
            qdb.queries_df = remove_q1_from_q2(qdb_rm.queries_df, qdb)
        if ap_file:
            apdb = dt.ResultsReader(ap_file, 'ap')
            if queries_group != 'title':
                # Filter the variations by the requested queries group.
                qdb.queries_df = filter_functions_dict[queries_group](
                    qdb.queries_df, apdb)
            elif quant_variants:
                # 'title' group with a quantile: keep only variants whose AP
                # falls in the requested quantile range.
                qdb.queries_df = filter_quant_variants(
                    qdb.queries_df, apdb, quantiles_dict[quant_variants])
        if stats:
            title_queries_file = dt.ensure_file(
                f'~/QppUqvProj/data/{corpus}/queries_{corpus}_title.txt')
            title_queries_df = dt.QueriesTextParser(
                title_queries_file).queries_df
            title_ap_file = dt.ensure_file(
                f'~/QppUqvProj/Results/{corpus}/test/basic/QLmap1000')
            title_ap = dt.ResultsReader(title_ap_file, 'ap')
            calc_statistics(qdb.queries_df, apdb, title_queries_df, title_ap,
                            filter_functions_dict, quantiles_dict, corpus)
            return
        elif plot_vars:
            title_queries_file = dt.ensure_file(
                f'~/QppUqvProj/data/{corpus}/queries_{corpus}_title.txt')
            title_queries_df = dt.QueriesTextParser(
                title_queries_file).queries_df
            title_ap_file = dt.ensure_file(
                f'~/QppUqvProj/Results/{corpus}/test/basic/QLmap1000')
            title_ap = dt.ResultsReader(title_ap_file, 'ap')
            plot_variants_ap(qdb.queries_df, apdb, title_queries_df, title_ap,
                             corpus)
            return
        # NOTE(review): apdb is only bound when ap_file was given; reaching
        # this call without --ap would raise NameError — confirm the CLI
        # enforces the flag combination.
        print_top_differences(qdb.queries_df, apdb, corpus)