Example 1
    def _calc_general_model_result(self, direct, predictor, sim_func):
        print(f'\n---Generating {predictor}-{sim_func} {direct} results---\n')
        _dict = defaultdict(list)

        def append_to_full_results_dict(_mean, _n):
            _dict['direction'].append(direct)
            _dict['predictor'].append(predictor)
            _dict['sim_func'].append(sim_func)
            _dict['n_vars'].append(_n)
            _dict['result'].append(_mean)

        mean = self.basic_results_dict.get(predictor, None)
        assert mean, f'self.basic_results_dict couldn\'t get {predictor}'
        append_to_full_results_dict(mean, 0)
        _dir = f'{self.results_dir}/{direct}'
        for n in range(1, self.max_n + 1):
            _predictions_dir = dp.ensure_dir(
                f'{_dir}/{n}_vars/general/{sim_func}/{predictor}/predictions')
            cv_obj = InterTopicCrossValidation(
                k=2,
                rep=30,
                folds_map_file=self.cv_map_file,
                predictions_dir=_predictions_dir,
                load=True,
                ap_file=self.query_ap_file,
                test=self.corr_measure)
            mean = cv_obj.calc_test_results()
            append_to_full_results_dict(mean, n)
        _df = pd.DataFrame.from_dict(_dict)
        return _df
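_calc_general_model_result builds one row per number of variants (n = 0 holding the single-query baseline) and returns a long-format DataFrame. A minimal sketch of how a caller might sweep it over a grid of settings and stack the results; the instance name and the direction/predictor/similarity values below are placeholders, not names taken from the project:

import pandas as pd

# results_obj is assumed to be an instance of the class that defines _calc_general_model_result.
frames = [results_obj._calc_general_model_result(direct, predictor, sim_func)
          for direct in ('asc', 'desc')          # placeholder direction labels
          for predictor in ('clarity', 'wig')    # placeholder predictor names
          for sim_func in ('rbo', 'jaccard')]    # placeholder similarity functions
full_results_df = pd.concat(frames, ignore_index=True)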
Example 2
 def calc_sim_ref_per_predictor(self, predictor):
     _results = defaultdict()
     ref_dir = f'{self.base_dir}/uqvPredictions/referenceLists'
     for quant in ['all', 'med', 'top', 'low', 'low-0']:
         # list to save results for a specific predictor with different quantile variations
         _quant_res = list()
         _index = list()
         for qgroup, query_group in QUERY_GROUPS.items():
             _predictions_dir = os.path.normpath(
                 f'{ref_dir}/{qgroup}/{quant}_vars/sim_as_pred/{predictor}/predictions')
             ap_file = os.path.normpath(f'{self.test_dir}/ref/QLmap1000-{qgroup}')
             ensure_file(ap_file)
             cv_obj = InterTopicCrossValidation(k=SPLITS, rep=REPEATS, predictions_dir=_predictions_dir,
                                                folds_map_file=self.cv_map_f, load=True, ap_file=ap_file, test=self.corr_measure)
             mean = cv_obj.calc_test_results()
             _quant_res.append('${}$'.format(mean))
             _index.append(query_group)
         sr = pd.Series(_quant_res)
         sr.name = quant
         sr.index = _index
         _results[quant] = sr
     res_df = pd.DataFrame.from_dict(_results, orient='index')
     res_df = res_df.reindex(index=['all', 'med', 'top', 'low', 'low-0'])
     res_df.index = res_df.index.str.title()
     res_df.index.name = 'Quantile'
     res_df.reset_index(inplace=True)
     res_df = res_df.reindex(['Quantile'] + _index, axis='columns')
     # res_df.rename(QUERY_GROUPS)
     res_df.insert(loc=0, column='Predictor', value=predictor)
     return res_df
Example 3
 def __init__(self,
              predictor,
              corpus,
              corr_measure='pearson',
              aggregation='avg',
              uef=False):
     self.corpus = corpus
     self.predictor = predictor
     self.__set_paths(corpus, predictor, aggregation)
     self.ap_obj = ResultsReader(self.ap_file, 'ap')
     self.working_dir = self.results_dir.replace('predictions', 'ltr')
     self.cv = InterTopicCrossValidation(folds_map_file=self.folds,
                                         predictions_dir=self.results_dir,
                                         test=corr_measure)
     self.folds_df = self.cv.data_sets_map
     _parameters = os.path.normpath(os.path.expanduser(self.parameters))
     self.parameters_df = self.cv.read_eval_results(_parameters)
     self.results_df = self.cv.full_set
     # self.feature_names = ['Jac_coefficient', 'Top_10_Docs_overlap', 'RBO_EXT_100', 'RBO_EXT_1000',
     #                       'RBO_FUSED_EXT_100', 'RBO_FUSED_EXT_1000'] # LTR-many
     self.feature_names = [
         'Jac_coefficient', 'Top_10_Docs_overlap', 'RBO_EXT_100',
         'RBO_FUSED_EXT_100'
     ]  # LTR-few
     features_df = features_loader(self.features, corpus)
     self.features_df = features_df.filter(items=self.feature_names)
Example 4
 def calc_sim_ref_per_group(self, qgroup):
     max_list = []
     _results = defaultdict()
     ref_dir = f'{self.base_dir}/uqvPredictions/referenceLists'
     ap_file = os.path.normpath(f'{self.test_dir}/ref/QLmap1000-{qgroup}')
     ensure_file(ap_file)
     for quant in QUANTILES:
         # list to save results for a specific predictor with different quantile variations
         _quant_res = list()
         _index = list()
         for predictor in SIM_REF_PREDICTORS:
             _predictions_dir = os.path.normpath(
                 f'{ref_dir}/{qgroup}/{quant}_vars/sim_as_pred/{predictor}/predictions')
             cv_obj = InterTopicCrossValidation(k=SPLITS, rep=REPEATS, predictions_dir=_predictions_dir,
                                                folds_map_file=self.cv_map_f, load=True, ap_file=ap_file, test=self.corr_measure)
             mean = cv_obj.calc_test_results()
             max_list.append(mean)
             _quant_res.append(mean)
             _index.append(SIM_REF_PREDICTORS[predictor])
         sr = pd.Series(_quant_res)
         sr.name = quant
         sr.index = _index
         _results[quant] = sr
     res_df = pd.DataFrame.from_dict(_results, orient='index')
     res_df = res_df.reindex(index=QUANTILES)
     res_df.index = res_df.index.str.title()
     res_df.index.name = 'Quantile'
     res_df.reset_index(inplace=True)
     res_df.insert(loc=0, column='Uniform', value='-')
     res_df = res_df.reindex(['Quantile'] + REFERENCE_TITLES, axis='columns')
     res_df.insert(loc=1, column=QUERY_GROUPS[qgroup], value='-')
     res_df.insert(loc=0, column='Predictor', value='SimilarityOnly')
     return res_df, max(max_list)
Example 5
 def calc_single_query_result(self, predictor):
     print(f'\n---Generating {predictor} 0 vars results---\n')
     _predictions_dir = dp.ensure_dir(
         f'{self.basic_predictions_dir}/{predictor}/predictions')
     cv_obj = InterTopicCrossValidation(k=2,
                                        rep=30,
                                        folds_map_file=self.cv_map_file,
                                        predictions_dir=_predictions_dir,
                                        load=True,
                                        ap_file=self.query_ap_file,
                                        test=self.corr_measure)
     mean = cv_obj.calc_test_results()
     self.basic_results_dict[predictor] = mean
Example 6
    def calc_single(self, single_f):
        test_dir = os.path.normpath('{}/ref'.format(self.test_dir))
        predictions_dir = '{}/basicPredictions/{}'.format(self.base_dir, single_f)
        ap_score = ensure_file('{}/QLmap1000-{}'.format(test_dir, single_f))

        _results = defaultdict()

        for p in PREDICTORS + UEF_PREDICTORS:
            # dir of aggregated prediction results
            _predictions_dir = os.path.normpath('{}/{}/predictions'.format(predictions_dir, p))
            # dir of aggregated uef prediction results
            # _uef_predictions_dir = os.path.normpath('{}/uef/{}/predictions'.format(predictions_dir, p))
            # list to save non uef results for a specific predictor with different AP files
            _p_res = list()
            # list to save uef results
            # _uef_p_res = list()
            _index = list()
            for measure in CORR_MEASURES:
                cv_obj = InterTopicCrossValidation(k=SPLITS, rep=REPEATS, predictions_dir=_predictions_dir,
                                                   folds_map_file=self.cv_map_f, load=True, test=measure,
                                                   ap_file=ap_score)
                # uef_cv_obj = CrossValidation(k=SPLITS, rep=REPEATS, predictions_dir=_uef_predictions_dir,
                #                              file_to_load=self.cv_map_f, load=True, test=measure,
                #                              ap_file=ap_score)
                mean = cv_obj.calc_test_results()
                # uef_mean = uef_cv_obj.calc_test_results()
                _p_res.append(float(mean))
                # _uef_p_res.append('${}$'.format(uef_mean))
                _index.append(measure)

            sr = pd.Series(_p_res)
            # uef_sr = pd.Series(_uef_p_res)
            sr.name = p
            sr.index = _index
            # uef_p = 'uef({})'.format(p)
            # uef_sr.name = uef_p
            # uef_sr.index = _index
            _results[p] = sr
            # _results[uef_p] = uef_sr

        res_df = pd.DataFrame.from_dict(_results, orient='index')
        # _uef_predictors = ['uef({})'.format(p) for p in PREDICTORS]
        res_df = res_df.reindex(index=PREDICTORS + UEF_PREDICTORS)
        res_df.index = res_df.index.str.upper()
        res_df.index.name = 'Predictor'
        res_df.reset_index(inplace=True)
        # res_df.columns = ['Predictor'] + CORR_MEASURES
        res_df = res_df.reindex(['Predictor'] + CORR_MEASURES, axis='columns')
        res_df.insert(loc=0, column='Function', value=single_f)
        return res_df
Example 7
    def calc_aggregated(self, aggregation):
        test_dir = os.path.normpath('{}/aggregated'.format(self.test_dir))
        predictions_dir = '{}/uqvPredictions/aggregated/{}'.format(self.base_dir, aggregation)
        _results = defaultdict()

        for p in PREDICTORS + UEF_PREDICTORS:
            # dir of aggregated prediction results
            _predictions_dir = os.path.normpath('{}/{}/predictions'.format(predictions_dir, p))
            # dir of aggregated uef prediction results
            # _uef_predictions_dir = os.path.normpath('{}/uef/{}/predictions'.format(predictions_dir, p))
            # list to save non uef results for a specific predictor with different AP files
            _p_res = list()
            # list to save uef results
            # _uef_p_res = list()
            _index = list()
            for agg in AGGREGATE_FUNCTIONS + ['combsum']:
                ap_score = ensure_file('{}/map1000-{}'.format(test_dir, agg))
                cv_obj = InterTopicCrossValidation(k=SPLITS, rep=REPEATS, predictions_dir=_predictions_dir,
                                                   folds_map_file=self.cv_map_f, load=True, test=self.corr_measure,
                                                   ap_file=ap_score)
                # uef_cv_obj = CrossValidation(k=SPLITS, rep=REPEATS, predictions_dir=_uef_predictions_dir,
                #                              file_to_load=self.cv_map_f, load=True, test=self.corr_measure,
                #                              ap_file=ap_score)
                mean = cv_obj.calc_test_results()
                # uef_mean = uef_cv_obj.calc_test_results()
                # _p_res.append('${}$'.format(mean))
                _p_res.append(mean)
                # _uef_p_res.append('${}$'.format(uef_mean))
                _index.append(agg)

            sr = pd.Series(_p_res)
            # uef_sr = pd.Series(_uef_p_res)
            sr.name = p
            sr.index = _index
            # uef_p = 'uef({})'.format(p)
            # uef_sr.name = uef_p
            # uef_sr.index = _index
            _results[p] = sr
            # _results[uef_p] = uef_sr

        res_df = pd.DataFrame.from_dict(_results, orient='index')
        # _uef_predictors = ['uef({})'.format(p) for p in PREDICTORS]
        res_df = res_df.reindex(index=PREDICTORS + UEF_PREDICTORS)
        res_df.index.name = 'predictor'
        res_df.index = res_df.index.str.upper()
        res_df.reset_index(inplace=True)
        res_df.insert(loc=0, column='predictor-agg', value=aggregation)
        # res_df.columns = ['predictor-agg', 'predictor'] + AGGREGATE_FUNCTIONS + ['combsum']
        res_df = res_df.reindex(['predictor-agg', 'predictor'] + AGGREGATE_FUNCTIONS + ['combsum'], axis='columns')
        return res_df
Example 8
 def __init__(self, corpus, predictor, load=False):
     self.corpus = corpus
     self.__set_paths(corpus, predictor)
     self.similarity_features_df = self.__initialize_features_df()
     self.norm_similarity_features_df = self.__normalize_similarity()
     self.similarity_measures = self.similarity_features_df.columns.tolist()
     self.var_cv = InterTopicCrossValidation(
         folds_map_file=self.folds, predictions_dir=self.vars_results_dir)
     # self.norm_prediction_scores = self.__normalize_predictions()
     self.raw_prediction_scores = self.__raw_predictions()
     self.prediction_scores = self.var_cv.full_set.columns.tolist()
     self.full_raw_weights_df = self.__initialize_full_weights_df()
     self.dict_all_options = self._set_weights()
     if load:
         try:
             # Try to load an existing dictionary; if that fails, generate and save a new one
             file_to_load = dp.ensure_file(
                 f'{self.res_dir}/test/pageRank/pkl_files/{predictor}/dict_all_options_stochastic.pkl'
             )
             with open(file_to_load, 'rb') as handle:
                 self.dict_all_options_stochastic = pickle.load(handle)
         except AssertionError:
             self.dict_all_options_stochastic = self._normalize_rows()
             self.__save_new_dictionary(corpus, predictor)
     else:
         self.dict_all_options_stochastic = self._normalize_rows()
         self.__save_new_dictionary(corpus, predictor)
Example 9
def init_eval(corpus, similarity, predictor):
    pth_dict = set_basic_paths(corpus)
    predictor_pkl_dir = dp.ensure_dir(f"{pth_dict['pkl_dir']}/{predictor}")
    predictions_dir = dp.ensure_dir(
        f'{pth_dict["res_dir"]}/{corpus}/uqvPredictions/referenceLists/pageRank/raw/{similarity}/{predictor}/predictions')
    ap_obj = dp.ResultsReader(pth_dict['ap_file'], 'ap')
    ap_df = add_topic_to_qdf(ap_obj.data_df)
    cv_obj = InterTopicCrossValidation(predictions_dir=predictions_dir, folds_map_file=pth_dict['cv_folds'])
    full_results_df = add_topic_to_qdf(cv_obj.full_set)
    return {'predictor_pkl_dir': predictor_pkl_dir, 'ap_obj': ap_obj, 'ap_df': ap_df,
            'full_results_df': full_results_df, 'cv_obj': cv_obj}
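init_eval returns the objects needed downstream bundled in a dict. A minimal consumption sketch, assuming init_eval is importable from the project code; the corpus, similarity, and predictor values are placeholders:

# Argument values are placeholders for whichever corpus/similarity/predictor is under evaluation.
objects = init_eval(corpus='ROBUST', similarity='RBO', predictor='clarity')
ap_df = objects['ap_df']
full_results_df = objects['full_results_df']
print(full_results_df.head())
print(ap_df.head())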
Example 10
    def __init__(self, predictor, corpus, qgroup, vars_quantile, **kwargs):
        graphs = kwargs.get('graphs', None)
        if graphs:
            n = kwargs.get('n', None)
            assert n, 'Missing number of vars'
            self.__set_graph_paths(corpus, predictor, qgroup, graphs, n)
        else:
            self.__set_paths(corpus, predictor, qgroup, vars_quantile)
        _q2p_obj = dp.QueriesTextParser(self.queries2predict_file, 'uqv')
        self.var_cv = InterTopicCrossValidation(
            folds_map_file=self.folds, predictions_dir=self.vars_results_dir)
        _vars_results_df = self.var_cv.full_set
        # Initialize the base prediction results of the queries to be predicted
        if qgroup == 'title':
            _base_cv = InterTopicCrossValidation(
                folds_map_file=self.folds,
                predictions_dir=self.base_results_dir)
            self.base_results_df = _base_cv.full_set
        else:
            self.base_results_df = dp.convert_vid_to_qid(
                _vars_results_df.loc[_q2p_obj.queries_dict.keys()])

        self.base_results_df.rename_axis('topic', inplace=True)
        # The next function is used to save results in basic predictions format of the given queries set
        # write_basic_predictions(self.base_results_df, corpus, qgroup, predictor)
        self.query_vars = dp.QueriesTextParser(self.query_vars_file, 'uqv')
        _quantile_vars = dp.QueriesTextParser(self.quantile_vars_file, 'uqv')
        _features_df = features_loader(self.features, corpus)
        self.features_df = self.__initialize_features_df(
            _quantile_vars, _features_df)
        self.var_scores_df = self.__initialize_var_scores_df(
            _features_df.reset_index()[['topic', 'qid']], _vars_results_df)
        self.geo_mean_df = self.__initialize_geo_scores_df(
            _features_df.reset_index()[['topic', 'qid']],
            dp.ResultsReader(self.geo_mean_file, 'predictions').data_df)
        self.real_ap_df = self.__initialize_var_scores_df(
            _features_df.reset_index()[['topic', 'qid']],
            dp.ResultsReader(self.real_ap_file, 'ap').data_df)
        self.geo_as_predictor()
Example 11
    def calc_reference_per_predictor(self, predictor, query_group, oracle=False):
        max_list = []
        _results = defaultdict()

        ap_file = os.path.normpath(f'{self.test_dir}/ref/QLmap1000-{query_group}')
        ensure_file(ap_file)
        base_pred_dir = f'{self.base_dir}/basicPredictions/{query_group}'

        """This part calculates and adds title queries column"""
        _base_predictions_dir = os.path.normpath('{}/{}/predictions'.format(base_pred_dir, predictor))
        cv_obj = InterTopicCrossValidation(k=SPLITS, rep=REPEATS, predictions_dir=_base_predictions_dir,
                                           folds_map_file=self.cv_map_f, load=True, ap_file=ap_file, test=self.corr_measure)
        _mean = cv_obj.calc_test_results()
        max_list.append(_mean)

        for quant in QUANTILES:

            if oracle:
                predictions_dir = f'{self.base_dir}/uqvPredictions/referenceLists/{query_group}/{quant}_vars/oracle'
            else:
                predictions_dir = f'{self.base_dir}/uqvPredictions/referenceLists/{query_group}/{quant}_vars/general'
            # list to save results for a specific predictor with different quantile variations
            _quant_res = [_mean]
            _index = [QUERY_GROUPS[query_group]]

            for ref_func, func_name in zip(REFERENCE_FUNCTIONS, REFERENCE_TITLES):
                _predictions_dir = os.path.normpath(f'{predictions_dir}/{ref_func}/{predictor}/predictions')
                _uef_predictions_dir = os.path.normpath(f'{predictions_dir}/{ref_func}/uef/{predictor}/predictions')
                cv_obj = InterTopicCrossValidation(k=SPLITS, rep=REPEATS, predictions_dir=_predictions_dir,
                                                   folds_map_file=self.cv_map_f, load=True, ap_file=ap_file, test=self.corr_measure)
                mean = cv_obj.calc_test_results()
                max_list.append(mean)
                _quant_res.append(mean)
                _index.append(func_name)

            sr = pd.Series(_quant_res)
            sr.index = _index
            sr.name = quant
            _results[quant] = sr

        res_df = pd.DataFrame.from_dict(_results, orient='index')
        res_df = res_df.reindex(index=QUANTILES)
        res_df.index = res_df.index.str.title()
        res_df.index.name = 'Quantile'
        res_df.reset_index(inplace=True)
        res_df = res_df.reindex(['Quantile', QUERY_GROUPS[query_group]] + REFERENCE_TITLES, axis='columns')
        res_df.insert(loc=0, column='Predictor', value=predictor.upper())
        return res_df, max(max_list)
Example 12
def calc_s(cv_obj: InterTopicCrossValidation, full_scores_df: pd.DataFrame):
    if hasattr(cv_obj, 'corr_df'):
        delattr(cv_obj, 'corr_df')  # drop the cached correlations so they are recomputed for the new scores
    cv_obj.full_set = full_scores_df
    score = cv_obj.calc_test_results()
    return float(score)
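calc_s re-scores an existing cross-validation object against a new prediction matrix; the cached corr_df is removed first so that calc_test_results recomputes the correlation for the replaced full_set. A minimal usage sketch, assuming calc_s and InterTopicCrossValidation are importable from the project code; the file paths and the re-weighting are placeholders:

# Construction mirrors the other examples; all paths here are placeholders.
cv_obj = InterTopicCrossValidation(k=2, rep=30, folds_map_file='2_folds_30_repetitions.json',
                                   predictions_dir='predictions/', load=True,
                                   ap_file='QLmap1000-title', test='pearson')
candidate_df = cv_obj.full_set * 0.5  # placeholder for a re-weighted candidate score matrix
print(calc_s(cv_obj, candidate_df))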
Example 13
 def _calc_cv_result(self, similarity, predictor, lambda_param):
     predictions_dir = self.results_dirs_dict.get((similarity, predictor, lambda_param))
     cv_obj = InterTopicCrossValidation(k=2, rep=30, folds_map_file=self.cv_map_file, predictions_dir=predictions_dir, load=True,
                                        ap_file=self.query_ap_file, test=self.corr_measure)
     mean = cv_obj.calc_test_results()
     return mean
Example 14
class LearningDataSets:
    def __init__(self,
                 predictor,
                 corpus,
                 corr_measure='pearson',
                 aggregation='avg',
                 uef=False):
        self.corpus = corpus
        self.predictor = predictor
        self.__set_paths(corpus, predictor, aggregation)
        self.ap_obj = ResultsReader(self.ap_file, 'ap')
        self.working_dir = self.results_dir.replace('predictions', 'ltr')
        self.cv = InterTopicCrossValidation(folds_map_file=self.folds,
                                            predictions_dir=self.results_dir,
                                            test=corr_measure)
        self.folds_df = self.cv.data_sets_map
        _parameters = os.path.normpath(os.path.expanduser(self.parameters))
        self.parameters_df = self.cv.read_eval_results(_parameters)
        self.results_df = self.cv.full_set
        # self.feature_names = ['Jac_coefficient', 'Top_10_Docs_overlap', 'RBO_EXT_100', 'RBO_EXT_1000',
        #                       'RBO_FUSED_EXT_100', 'RBO_FUSED_EXT_1000'] # LTR-many
        self.feature_names = [
            'Jac_coefficient', 'Top_10_Docs_overlap', 'RBO_EXT_100',
            'RBO_FUSED_EXT_100'
        ]  # LTR-few
        features_df = features_loader(self.features, corpus)
        self.features_df = features_df.filter(items=self.feature_names)

    @classmethod
    def __set_paths(cls, corpus, predictor, agg):
        """This method sets the default paths of the files and the working directories, it assumes the standard naming
         convention of the project"""

        _base_dir = f'~/QppUqvProj/Results/{corpus}/uqvPredictions/'
        _base_dir = os.path.normpath(os.path.expanduser(_base_dir))
        cls.parameters = '{}/aggregated/{}/{}/evaluation/full_results_vector_for_2_folds_30_repetitions_{}.json'.format(
            _base_dir, agg, predictor, agg)
        cls.results_dir = '{}/raw/{}/predictions/'.format(_base_dir, predictor)
        cls.output_dir = '{}/aggregated/{}/{}/ltr/datasets/'.format(
            _base_dir, agg, predictor)
        ensure_dir(cls.output_dir)
        _test_dir = f'~/QppUqvProj/Results/{corpus}/test/'
        _test_dir = os.path.normpath(os.path.expanduser(_test_dir))
        cls.folds = '{}/2_folds_30_repetitions.json'.format(_test_dir)
        cls.features = '{}/raw/norm_features_{}_uqv.JSON'.format(
            _test_dir, corpus)
        cls.ap_file = '{}/aggregated/map1000-{}'.format(_test_dir, agg)

    def _create_data_set(self, param):
        predictor_results = self.results_df[f'score_{param}']
        feat_df = self.features_df.multiply(predictor_results,
                                            axis=0,
                                            level='qid')
        feat_df = feat_df.groupby('topic').sum()
        feat_df = feat_df.apply(np.log)
        feat_df = feat_df.merge(self.ap_obj.data_df,
                                left_index=True,
                                right_index=True)
        feat_df.insert(0, 'qid', 'qid:1')
        return feat_df

    def _split_data_set(self, dataset_df, set_id, subset):
        set_id = int(set_id)
        train = np.array(self.folds_df[set_id][subset]['train']).astype(str)
        test = np.array(self.folds_df[set_id][subset]['test']).astype(str)
        return dataset_df.loc[train], dataset_df.loc[test]

    def _df_to_str(self, df):
        formatters = {}
        for i, feat in enumerate(self.feature_names):
            j = i + 1
            s = f'{j}' + ':{:f}'
            formatters[feat] = s.format

        _df = df.to_string(columns=[
            'ap',
            'qid',
        ] + self.feature_names,
                           index=False,
                           index_names=False,
                           header=False,
                           float_format='%f',
                           formatters=formatters)
        return _df

    def generate_data_sets_fine_tune(self):
        """This method will create the data sets with all the available hyper parameters of the qpp predictions"""
        run(f'rm -rfv {self.output_dir}*', shell=True)
        for set_id in self.parameters_df.index:
            for subset in ['a', 'b']:
                for col in self.results_df.columns:
                    h = col.split('_')[-1]
                    features_df = self._create_data_set(h)
                    train_df, test_df = self._split_data_set(
                        features_df, set_id, subset)
                    train_str = self._df_to_str(train_df)
                    test_str = self._df_to_str(test_df)
                    self.write_str_to_file(
                        train_str, f'train_{set_id}_{subset}-d_{h}.dat')
                    self.write_str_to_file(
                        test_str, f'test_{set_id}_{subset}-d_{h}.dat')

    def generate_data_sets(self):
        """This method will create the data sets with a single hyper parameter for the qpp predictions, which will be
        chosen based on the best result on the train set"""
        run(f'rm -rfv {self.output_dir}*', shell=True)
        for set_id in self.parameters_df.index:
            for subset in ['a', 'b']:
                param = self.parameters_df.loc[set_id][subset]
                features_df = self._create_data_set(param)
                train_df, test_df = self._split_data_set(
                    features_df, set_id, subset)
                train_str = self._df_to_str(train_df)
                test_str = self._df_to_str(test_df)
                self.write_str_to_file(train_str,
                                       f'train_{set_id}_{subset}.dat')
                self.write_str_to_file(test_str, f'test_{set_id}_{subset}.dat')

    def write_str_to_file(self, string, file_name):
        with open(self.output_dir + file_name, "w") as text_file:
            print(string, file=text_file)

    def run_svm_fine_tune(self):
        svm_learn = '~/svmRank/svm_rank_learn'
        svm_classify = '~/svmRank/svm_rank_classify'
        models_dir = self.output_dir.replace('datasets', 'models')
        ensure_dir(models_dir)
        classification_dir = self.output_dir.replace('datasets',
                                                     'classifications')
        ensure_dir(classification_dir)
        run(f'rm -rfv {models_dir}*', shell=True)
        run(f'rm -rfv {classification_dir}*', shell=True)
        train_sets = glob.glob(f'{self.output_dir}/train*')
        for c in C_list:
            for trainset in train_sets:
                testset = trainset.replace('train', 'test')
                # use splitext to drop the '.dat' extension; str.strip('.dat') would strip any matching edge characters
                _model_params = os.path.splitext(trainset)[0].split('_', 1)[-1]
                _model_path = f'{models_dir}model_{_model_params}_c_{c}'
                _cls_train_path = f'{classification_dir}train_{_model_params}_c_{c}.cls'
                _cls_test_path = f'{classification_dir}test_{_model_params}_c_{c}.cls'
                run('{0} -c {1} {2} {3}'.format(svm_learn, c, trainset,
                                                _model_path),
                    shell=True)
                run('{0} {1} {2} {3}'.format(svm_classify, trainset,
                                             _model_path, _cls_train_path),
                    shell=True)
                run('{0} {1} {2} {3}'.format(svm_classify, testset,
                                             _model_path, _cls_test_path),
                    shell=True)

    def run_svm(self):
        c = '1'
        svm_learn = '~/svmRank/svm_rank_learn'
        svm_classify = '~/svmRank/svm_rank_classify'
        models_dir = self.output_dir.replace('datasets', 'models')
        ensure_dir(models_dir)
        classification_dir = self.output_dir.replace('datasets',
                                                     'classifications')
        run(f'rm -rfv {models_dir}*', shell=True)
        run(f'rm -rfv {classification_dir}*', shell=True)
        ensure_dir(classification_dir)
        for set_id in range(1, 31):
            for subset in ['a', 'b']:
                run('{0} -c {1} {2}/train_{3}_{4}.dat {5}/model_{3}_{4}'.
                    format(svm_learn, c, self.output_dir, set_id, subset,
                           models_dir),
                    shell=True)
                run('{0} {1}/test_{2}_{3}.dat {4}/model_{2}_{3} {5}/predictions_{2}_{3}'
                    .format(svm_classify, self.output_dir, set_id, subset,
                            models_dir, classification_dir),
                    shell=True)

    @staticmethod
    def _df_from_files(files):
        _list = []
        for file in files:
            _str = file.split('_', 1)[-1]
            # drop the '.cls' extension before extracting the parameter string
            _params = os.path.splitext(_str)[0].split('-', 1)[-1]
            _df = pd.read_csv(file, header=None, names=[_params])
            _list.append(_df)
        return pd.concat(_list, axis=1)

    def cross_val_fine_tune(self):
        classification_dir = self.output_dir.replace('datasets',
                                                     'classifications')
        _list = []
        _dict = {}
        for set_id in range(1, 31):
            _pair = []
            for subset in ['a', 'b']:
                train_files = glob.glob(classification_dir +
                                        f'train_{set_id}_{subset}-*')
                _train_df = self._df_from_files(train_files)
                _train_topics = np.array(
                    self.folds_df[set_id][subset]['train']).astype(str)
                _train_df.insert(loc=0, column='qid', value=_train_topics)
                _train_df.set_index('qid', inplace=True)
                _ap_df = self.ap_obj.data_df.loc[_train_topics]
                _df = _train_df.merge(_ap_df, how='outer', on='qid')
                _correlation_df = _df.corr(method=self.cv.test)
                _corr = _correlation_df.drop('ap')['ap']
                max_train_param = _corr.idxmax()
                _test_file = classification_dir + f'test_{set_id}_{subset}-{max_train_param}.cls'
                _test_df = pd.read_csv(_test_file,
                                       header=None,
                                       names=['score'])
                _test_topics = np.array(
                    self.folds_df[set_id][subset]['test']).astype(str)
                _test_df.insert(loc=0, column='qid', value=_test_topics)
                _test_df.set_index('qid', inplace=True)
                _ap_df = self.ap_obj.data_df.loc[_test_topics]
                _df = _test_df.merge(_ap_df, how='outer', on='qid')
                _correlation = _df['score'].corr(_df['ap'],
                                                 method=self.cv.test)
                _pair.append(_correlation)
            _list.append(np.mean(_pair))
        print('mean: {:.3f}'.format(np.mean(_list)))

    def cross_val(self):
        simple_results = {}
        classification_dir = self.output_dir.replace('datasets',
                                                     'classifications')
        eval_dir = ensure_dir(self.output_dir.replace('datasets',
                                                      'evaluation'))
        _list = []
        for set_id in range(1, 31):
            _pair = []
            for subset in ['a', 'b']:
                _res_df = pd.read_csv(
                    f'{classification_dir}/predictions_{set_id}_{subset}',
                    header=None,
                    names=['score'])
                _test_topics = np.array(
                    self.folds_df[set_id][subset]['test']).astype(str)
                _res_df.insert(loc=0, column='qid', value=_test_topics)
                _res_df.set_index('qid', inplace=True)
                _ap_df = self.ap_obj.data_df.loc[_test_topics]
                _df = _res_df.merge(_ap_df, how='outer', on='qid')
                _correlation = _df['score'].corr(_df['ap'],
                                                 method=self.cv.test)
                _pair.append(_correlation)
            avg_res = np.mean(_pair)
            _list.append(avg_res)
            simple_results['set {}'.format(set_id)] = avg_res
        simple_results_df = pd.Series(simple_results)
        simple_results_df.to_json(
            f'{eval_dir}/simple_results_vector_for_2_folds_30_repetitions_ltr.json')
        print('mean: {:.3f}'.format(np.mean(_list)))
        if check_significance(self.corpus, self.predictor):
            print('significant!')
        else:
            print('Not significant!')
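A driver for the class above would generate the per-fold datasets, train and apply the SVMrank models, and report the cross-validated correlation. A minimal sketch, assuming the results tree and the ~/svmRank binaries referenced in __set_paths exist; the predictor and corpus names are placeholders:

if __name__ == '__main__':
    lds = LearningDataSets(predictor='clarity', corpus='ROBUST',
                           corr_measure='pearson', aggregation='avg')
    lds.generate_data_sets()   # write train_*/test_*.dat files for every fold and subset
    lds.run_svm()              # train svm_rank models and write per-fold predictions
    lds.cross_val()            # average the per-fold correlations and print the mean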