Example #1
0
def check_significance(corpus, predictor, alpha=0.05):
    """Run a t-test comparing a predictor's averaged baseline CV correlations
    against its LTR candidate correlations at significance level *alpha*."""
    _base_dir = f'~/QppUqvProj/Results/{corpus}/uqvPredictions/aggregated/avg/'

    def _load_correlations(sub_path, suffix):
        # Read one per-repetition correlation vector (JSON) into a
        # single-column float DataFrame indexed by repetition id.
        _eval_dir = dp.ensure_dir(f'{_base_dir}/{predictor}/{sub_path}')
        _json_file = dp.ensure_file(
            f'{_eval_dir}/simple_results_vector_for_2_folds_30_repetitions_{suffix}.json'
        )
        with open(_json_file) as fp:
            _raw = json.load(fp)
        return pd.DataFrame.from_dict(_raw,
                                      orient='index',
                                      columns=['correlation'],
                                      dtype=float)

    baseline_sr = _load_correlations('evaluation/', 'avg')
    candidate_sr = _load_correlations('ltr/evaluation/', 'ltr')
    print(f'baseline: {baseline_sr.mean()[0]:.3f}')
    return t_test(baseline_sr, candidate_sr, alpha)
Example #2
0
 def run_svm_fine_tune(self):
     """Grid-search the SVMrank C parameter over every train set.

     For each C value and each train set, learns a ranking model and then
     classifies both the train and the matching test split; models and
     classification outputs are written to sibling dirs of the datasets dir.
     """
     svm_learn = '~/svmRank/svm_rank_learn'
     svm_classify = '~/svmRank/svm_rank_classify'
     models_dir = self.output_dir.replace('datasets', 'models')
     ensure_dir(models_dir)
     classification_dir = self.output_dir.replace('datasets',
                                                  'classifications')
     ensure_dir(classification_dir)
     # Start the sweep from clean model/classification directories
     run(f'rm -rfv {models_dir}*', shell=True)
     run(f'rm -rfv {classification_dir}*', shell=True)
     train_sets = glob.glob(f'{self.output_dir}/train*')
     for c in C_list:
         for trainset in train_sets:
             testset = trainset.replace('train', 'test')
             # BUGFIX: str.strip('.dat') removes any of the characters
             # '.', 'd', 'a', 't' from BOTH ends (not the suffix), so e.g.
             # 'train_1_a.dat' lost its trailing 'a'. Remove the extension
             # explicitly instead.
             _base = trainset[:-len('.dat')] if trainset.endswith('.dat') else trainset
             _model_params = _base.split('_', 1)[-1]
             _model_path = f'{models_dir}model_{_model_params}_c_{c}'
             _cls_train_path = f'{classification_dir}train_{_model_params}_c_{c}.cls'
             _cls_test_path = f'{classification_dir}test_{_model_params}_c_{c}.cls'
             run('{0} -c {1} {2} {3}'.format(svm_learn, c, trainset,
                                             _model_path),
                 shell=True)
             run('{0} {1} {2} {3}'.format(svm_classify, trainset,
                                          _model_path, _cls_train_path),
                 shell=True)
             run('{0} {1} {2} {3}'.format(svm_classify, testset,
                                          _model_path, _cls_test_path),
                 shell=True)
Example #3
0
def write_basic_predictions(df: pd.DataFrame, corpus, qgroup, predictor):
    """The function is used to save results in basic predictions format of a given queries set"""
    # The output dir depends only on corpus/qgroup/predictor, so build and
    # ensure it once instead of once per column.
    _file_path = f'~/QppUqvProj/Results/{corpus}/basicPredictions/{qgroup}/{predictor}/predictions/'
    dp.ensure_dir(os.path.normpath(os.path.expanduser(_file_path)))
    for col in df.columns:
        # Columns are named 'score_X'; files are named 'predictions-X'
        _file_name = col.replace('score_', 'predictions-')
        file_name = f'{_file_path}{_file_name}'
        df[col].to_csv(file_name, sep=" ", header=False, index=True)
Example #4
0
 def _cp_result_file_to_dirs(self):
     """Copy raw prediction files into per (similarity, predictor, lambda)
     result directories and return the mapping to the destination dirs."""
     dest_map = defaultdict(str)
     for lam in LAMBDA:
         for sim_func, predictor in itertools.product(SIMILARITY_FUNCTIONS.values(), PREDICTORS):
             target = dp.ensure_dir(f'{self.results_dir}/{sim_func}/{predictor}/lambda-{lam}/predictions')
             dest_map[sim_func, predictor, f'{lam:.2f}'] = target
             source = dp.ensure_dir(f'{self.raw_res_base_dir}/{sim_func}/{predictor}/predictions')
             # Only the files generated with this lambda value are copied
             for pred_file in glob(f'{source}/predictions-*+lambda+{lam}'):
                 copy2(pred_file, target)
     return dest_map
Example #5
0
 def __set_paths(self):
     """This method sets the default paths of the files and the working directories, it assumes the standard naming
      convention of the project"""
     results_root = dp.ensure_dir(f'~/QppUqvProj/Results/{self.corpus}')
     data_root = dp.ensure_dir(f'~/QppUqvProj/data/{self.corpus}')
     # QL retrieval results over the raw query set
     self.ql_results_file = dp.ensure_file(
         f'{results_root}/test/raw/QL.res')
     # Full stemmed UQV queries file
     self.queries_txt_file = dp.ensure_file(
         f'{data_root}/queries_{self.corpus}_UQV_full.stemmed.txt')
     # self.predictions_dir = dp.ensure_dir(f'{results_root}/uqvPredictions/raw/{self.predictor}')
     self.pkl_dir = dp.ensure_dir(f'{results_root}/test/raw/pkl_files/')
Example #6
0
    def __set_paths(cls, corpus, predictor, qgroup, vars_quantile):
        """This method sets the default paths of the files and the working directories, it assumes the standard naming
         convention of the project"""
        cls.predictor = predictor

        _base_dir = f'~/QppUqvProj/Results/{corpus}/uqvPredictions/'
        # Raw per-variation prediction files of this predictor
        cls.vars_results_dir = dp.ensure_dir(
            f'{_base_dir}/raw/{predictor}/predictions/')

        # Baseline predictions exist only for the 'title' query group
        if qgroup == 'title':
            _orig_dir = dp.ensure_dir(
                f'~/QppUqvProj/Results/{corpus}/basicPredictions/title')
            cls.base_results_dir = f'{_orig_dir}/{predictor}/predictions/'

        cls.output_dir = dp.ensure_dir(
            f'{_base_dir}/referenceLists/{qgroup}/{vars_quantile}_vars/')

        _test_dir = f'~/QppUqvProj/Results/{corpus}/test'
        # 2-fold / 30-repetition cross-validation split map
        cls.folds = dp.ensure_file(f'{_test_dir}/2_folds_30_repetitions.json')

        cls.ap_file = dp.ensure_file(f'{_test_dir}/ref/QLmap1000-{qgroup}')

        # cls.features = '{}/raw/query_features_{}_uqv_legal.JSON'.format(_test_dir, corpus)
        # cls.features = f'{_test_dir}/ref/{qgroup}_query_features_{corpus}_uqv.JSON'
        cls.features = dp.ensure_file(
            f'{_test_dir}/ref/{qgroup}_query_{vars_quantile}_variations_features_{corpus}_uqv.JSON'
        )

        cls.geo_mean_file = dp.ensure_file(
            f'{_base_dir}/raw/geo/predictions/predictions-20000')

        # The variations file is used in the filter function - it consists of all the vars w/o the query at hand
        _query_vars = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_wo_{qgroup}.txt'
        cls.query_vars_file = os.path.normpath(os.path.expanduser(_query_vars))
        dp.ensure_file(cls.query_vars_file)

        _queries2predict = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_{qgroup}.txt'
        cls.queries2predict_file = dp.ensure_file(_queries2predict)

        # 'all' reuses the full variations file; otherwise pick the quantile subset
        if vars_quantile == 'all':
            cls.quantile_vars_file = cls.query_vars_file
        else:
            _quantile_vars = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_{vars_quantile}_variants.txt'
            cls.quantile_vars_file = os.path.normpath(
                os.path.expanduser(_quantile_vars))
            dp.ensure_file(cls.quantile_vars_file)

        cls.real_ap_file = dp.ensure_file(
            f'~/QppUqvProj/Results/{corpus}/test/raw/QLmap1000')

        cls.geo_predictions_dir = dp.ensure_dir(
            f'{_base_dir}/referenceLists/{qgroup}/{vars_quantile}_vars/sim_as_pred/geo/predictions'
        )
Example #7
0
 def __init__(self, qpp_ref: QueryPredictionRef, corr_measure='pearson'):
     """Set up an LTR run from an existing QueryPredictionRef.

     Copies the features/results frames and the CV folds from *qpp_ref*,
     creates a per-predictor output dir under the ref's output dir, and
     reserves all but one CPU core for worker processes.
     """
     self.corr_measure = corr_measure
     _predictor = qpp_ref.predictor
     self.features_df = qpp_ref.features_df
     self.results_df = qpp_ref.var_scores_df
     _ap_file = qpp_ref.ap_file
     # AP (average precision) values serve as the learning target
     self.ap_obj = dp.ResultsReader(_ap_file, 'ap')
     self.folds_df = qpp_ref.var_cv.data_sets_map.transpose()
     self.output_dir = f'{qpp_ref.output_dir}/ltr/{_predictor}/'
     dp.ensure_dir(self.output_dir)
     self.calc_features_df = qpp_ref.calc_integrated
     self.feature_names = self.features_df.columns.tolist()
     # Leave one core free for the main process
     self.cpu_cores = mp.cpu_count() - 1
Example #8
0
def main(args):
    """Compute geometric-mean RM predictions for *args.corpus* and save them.

    Reads the full UQV queries file and the per-query RM term-probability
    files, computes the geo-mean predictor and writes the predictions via
    write_predictions. Returns silently when no corpus was given.
    """
    corpus = args.corpus

    # corpus = 'ROBUST'

    if not corpus:
        return

    queries_file = dp.ensure_file(
        f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_full.txt')
    rm_probabilities_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/uqvPredictions/raw/RMprob')

    # queries_file = dp.ensure_file(f'~/QppUqvProj/data/{corpus}/queries.txt')
    # rm_probabilities_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}/basicPredictions/title/RMprob')

    queries_obj = dp.QueriesTextParser(queries_file)
    rm_probabilities_df = dp.read_rm_prob_files(rm_probabilities_dir,
                                                number_of_docs=20000,
                                                clipping='*')

    # 'in' already yields a bool; 'True if ... else False' was redundant
    uqv = 'uqv' in queries_file.split('/')[-1].lower()

    results_df = geo_mean(queries_obj, rm_probabilities_df)
    write_predictions(results_df, corpus, uqv)
Example #9
0
def write_predictions(df, corpus, uqv):
    """Persist every column of *df* as a separate predictions file.

    UQV runs go under uqvPredictions/raw, title runs under
    basicPredictions/title; both end in geo/predictions.
    """
    _sub_path = 'uqvPredictions/raw' if uqv else 'basicPredictions/title'
    _dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/{_sub_path}/geo/predictions')
    for col in df:
        df[col].to_csv(f'{_dir}/predictions-{col}',
                       sep=" ",
                       header=False,
                       index=True,
                       float_format='%f')
Example #10
0
    def _calc_general_model_result(self, direct, predictor, sim_func):
        """Collect CV results for one predictor/similarity pair in one direction.

        Starts from the cached 0-variations baseline result, then evaluates
        the general model for 1..max_n variations; returns a long-format
        DataFrame with one row per number of variations.
        """
        print(f'\n---Generating {predictor}-{sim_func} {direct} results---\n')
        _dict = defaultdict(list)

        def append_to_full_results_dict(_mean, _n):
            # Append one row to the long-format results table
            _dict['direction'].append(direct)
            _dict['predictor'].append(predictor)
            _dict['sim_func'].append(sim_func)
            _dict['n_vars'].append(_n)
            _dict['result'].append(_mean)

        # The n=0 baseline must have been computed beforehand
        mean = self.basic_results_dict.get(predictor, None)
        assert mean, f'self.basic_results_dict couldn\'t get {predictor}'
        append_to_full_results_dict(mean, 0)
        _dir = f'{self.results_dir}/{direct}'
        for n in range(1, self.max_n + 1):
            _predictions_dir = dp.ensure_dir(
                f'{_dir}/{n}_vars/general/{sim_func}/{predictor}/predictions')
            # 2-fold / 30-repetition CV over the pre-generated predictions
            cv_obj = InterTopicCrossValidation(
                k=2,
                rep=30,
                folds_map_file=self.cv_map_file,
                predictions_dir=_predictions_dir,
                load=True,
                ap_file=self.query_ap_file,
                test=self.corr_measure)
            mean = cv_obj.calc_test_results()
            append_to_full_results_dict(mean, n)
        _df = pd.DataFrame.from_dict(_dict)
        return _df
Example #11
0
def load_full_features_df(**kwargs):
    """
    :param kwargs: corpus, queries_group, quantile or features_factory_obj: QueryFeatureFactory() object
    :return: pd.DataFrame that contains all the features values
    """
    corpus = kwargs.get('corpus', None)
    queries_group = kwargs.get('queries_group', None)
    quantile = kwargs.get('quantile', None)
    features_factory_obj = kwargs.get('features_factory_obj', None)
    if features_factory_obj:
        # An existing factory carries its own corpus/group settings
        features_obj = features_factory_obj
        corpus = features_obj.corpus
        queries_group = features_obj.queries_group
    else:
        assert corpus and queries_group and quantile, f"Can't create a factory object from Corpus={corpus}, " \
                                                      f"Queries group={queries_group}, Variations Quantile={quantile}"
        features_obj = RefQueryFeatureFactory(corpus, queries_group, quantile)
    pkl_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/test/ref/pkl_files/')
    _list = []
    last_df = pd.DataFrame()
    for n in NUMBER_OF_DOCS:
        _file = f'{pkl_dir}/{queries_group}_queries_{corpus}_RBO_{n}_TopDocs_{n}.pkl'
        try:
            # presumably dp.ensure_file raises AssertionError for a missing
            # pickle (matching the except clause below) — verify in dp
            dp.ensure_file(_file)
            _df = pd.read_pickle(_file).set_index(['topic', 'qid'])
            # Normalize the overlap count by n (fraction of the top-n docs)
            _df[f'Top_{n}_Docs_overlap'] = _df[f'Top_{n}_Docs_overlap'] / n
            # Jac_coefficient is dropped from every frame and only the last
            # seen copy is kept, so the final concat has a single column of it
            _list.append(_df.drop('Jac_coefficient', axis=1))
            last_df = _df['Jac_coefficient']
        except AssertionError:
            # Best-effort: a missing pickle is reported but not fatal
            print(f'!! Warning !! The file {_file} is missing')
    df = pd.concat(_list + [last_df], axis=1)
    return features_obj.divide_by_size(df)
Example #12
0
 def __init__(self, folds_map_file=None, k=2, rep=30, predictions_dir=None, test='pearson', ap_file=None,
              generate_folds=False, **kwargs):
     """Cross-validation driver over per-query prediction files.

     :param folds_map_file: JSON map of the k-fold/repetition splits (required)
     :param k: number of folds
     :param rep: number of CV repetitions
     :param predictions_dir: directory holding prediction files (required, non-empty)
     :param test: correlation measure used as the test statistic
     :param ap_file: optional AP file; a '-suffix' in its name selects the AP function
     :param generate_folds: when True, generate a fresh folds file instead of loading
     """
     logging.debug("testing logger")
     self.k = k
     self.rep = rep
     self.test = test
     assert predictions_dir, 'Specify predictions dir'
     assert folds_map_file, 'Specify path for CV folds file'
     predictions_dir = os.path.abspath(os.path.normpath(os.path.expanduser(predictions_dir)))
     assert os.listdir(predictions_dir), f'{predictions_dir} is empty'
     # Evaluation output mirrors the predictions dir layout
     self.output_dir = dp.ensure_dir(predictions_dir.replace('predictions', 'evaluation'))
     if ap_file:
         self.full_set = self._build_full_set(predictions_dir, ap_file)
         if '-' in ap_file:
             # e.g. 'QLmap1000-title' -> ap_func 'title'
             self.ap_func = ap_file.split('-')[-1]
         else:
             self.ap_func = 'basic'
     else:
         self.full_set = self._build_full_set(predictions_dir)
     if generate_folds:
         self.index = self.full_set.index
         self.folds_file = self._generate_k_folds()
         self.__load_k_folds()
     else:
         try:
             self.folds_file = dp.ensure_file(folds_map_file)
         except FileExistsError:
             # NOTE(review): this expects dp.ensure_file to raise
             # FileExistsError for a *missing* file — confirm in dp. On this
             # path self.folds_file is left unset before __load_k_folds()
             # runs, which looks likely to raise AttributeError.
             print("The folds file specified doesn't exist, going to generate the file and save")
         self.__load_k_folds()
Example #13
0
 def cross_val(self):
     """Evaluate the LTR classifications over the 2x30 CV splits.

     For every repetition, correlates the SVMrank scores of each half-fold
     with the true AP values, averages the two halves, saves the per-set
     result vector as JSON, prints the overall mean and reports whether the
     improvement over the baseline predictor is significant.
     """
     simple_results = {}
     classification_dir = self.output_dir.replace('datasets',
                                                  'classifications')
     eval_dir = ensure_dir(self.output_dir.replace('datasets',
                                                   'evaluation'))
     _list = []
     for set_id in range(1, 31):
         _pair = []
         for subset in ['a', 'b']:
             # SVMrank output: one score per line, in test-topic order
             _res_df = pd.read_csv(
                 f'{classification_dir}/predictions_{set_id}_{subset}',
                 header=None,
                 names=['score'])
             _test_topics = np.array(
                 self.folds_df[set_id][subset]['test']).astype(str)
             # Attach the topic ids so scores can be joined with AP values
             _res_df.insert(loc=0, column='qid', value=_test_topics)
             _res_df.set_index('qid', inplace=True)
             _ap_df = self.ap_obj.data_df.loc[_test_topics]
             _df = _res_df.merge(_ap_df, how='outer', on='qid')
             _correlation = _df['score'].corr(_df['ap'],
                                              method=self.cv.test)
             _pair.append(_correlation)
         # Average the correlations of the two half-folds of this repetition
         avg_res = np.mean(_pair)
         _list.append(avg_res)
         simple_results['set {}'.format(set_id)] = avg_res
     simple_results_df = pd.Series(simple_results)
     simple_results_df.to_json(
         ('{}/simple_results_vector_for_2_folds_30_repetitions_ltr.json'.
          format(eval_dir)))
     print('mean: {:.3f}'.format(np.mean(_list)))
     if check_significance(self.corpus, self.predictor):
         print('significant!')
     else:
         print('Not significant!')
Example #14
0
    def __set_paths(cls, corpus, predictor, agg):
        """This method sets the default paths of the files and the working directories, it assumes the standard naming
         convention of the project"""

        _base_dir = f'~/QppUqvProj/Results/{corpus}/uqvPredictions/'
        _base_dir = os.path.normpath(os.path.expanduser(_base_dir))
        # Consistency: use f-strings like the rest of the module instead of
        # the mixed str.format style (paths are unchanged).
        # Evaluation vector produced by the aggregated CV run
        cls.parameters = f'{_base_dir}/aggregated/{agg}/{predictor}/evaluation/full_results_vector_for_2_folds_30_repetitions_{agg}.json'
        cls.results_dir = f'{_base_dir}/raw/{predictor}/predictions/'
        cls.output_dir = f'{_base_dir}/aggregated/{agg}/{predictor}/ltr/datasets/'
        ensure_dir(cls.output_dir)
        _test_dir = f'~/QppUqvProj/Results/{corpus}/test/'
        _test_dir = os.path.normpath(os.path.expanduser(_test_dir))
        # CV folds map, normalized feature file and aggregated AP file
        cls.folds = f'{_test_dir}/2_folds_30_repetitions.json'
        cls.features = f'{_test_dir}/raw/norm_features_{corpus}_uqv.JSON'
        cls.ap_file = f'{_test_dir}/aggregated/map1000-{agg}'
Example #15
0
    def __set_paths(cls, corpus):
        """This method sets the default paths of the files and the working directories, it assumes the standard naming
         convention of the project"""
        test_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}/test/')
        graphs_root = f'~/QppUqvProj/Graphs/{corpus}'

        # AP file for the cross validation process
        cls.query_ap_file = dp.ensure_file(f'{test_dir}/ref/QLmap1000-title')
        # CV folds mapping file
        cls.cv_map_file = dp.ensure_file(f'{test_dir}/2_folds_30_repetitions.json')
        # The data dir for the Graphs
        cls.data_dir = dp.ensure_dir(f'{graphs_root}/data')
        # The results base dir for the Graphs
        cls.results_dir = dp.ensure_dir(f'{graphs_root}/referenceLists/title/all_vars/general')
        cls.raw_res_base_dir = dp.ensure_dir(
            f'~/QppUqvProj/Results/{corpus}/uqvPredictions/referenceLists/title/all_vars/general')

        # True AP values of the title queries
        cls.true_ap_file = dp.ensure_file(
            f'~/QppUqvProj/Results/{corpus}/test/basic/QLmap1000')
Example #16
0
    def __set_graph_paths(cls, corpus, predictor, qgroup, direct, n):
        """This method sets the default paths of the files and the working directories, it assumes the standard naming
         convention of the project"""
        cls.predictor = predictor

        _corpus_res_dir = dp.ensure_dir(
            f'~/QppUqvProj/Results/{corpus}/uqvPredictions/')
        _corpus_dat_dir = dp.ensure_dir(f'~/QppUqvProj/data/{corpus}')

        _graphs_base_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}')
        _graphs_dat_dir = dp.ensure_dir(f'{_graphs_base_dir}/data/{direct}')

        # Prediction results of all UQV query variants
        cls.vars_results_dir = dp.ensure_dir(
            f'{_corpus_res_dir}/raw/{predictor}/predictions/')

        # Prediction results of the queries to be predicted
        _orig_dir = dp.ensure_dir(
            f'~/QppUqvProj/Results/{corpus}/basicPredictions/{qgroup}')
        cls.base_results_dir = f'{_orig_dir}/{predictor}/predictions/'

        # The directory to save the new results
        cls.output_dir = dp.ensure_dir(
            f'{_graphs_base_dir}/referenceLists/{qgroup}/{direct}/{n}_vars')

        # The files for used for the LTR and CV
        _test_dir = f'~/QppUqvProj/Results/{corpus}/test'
        cls.folds = dp.ensure_file(f'{_test_dir}/2_folds_30_repetitions.json')
        cls.ap_file = dp.ensure_file(f'{_test_dir}/ref/QLmap1000-{qgroup}')

        # The features file used for prediction
        cls.features = dp.ensure_file(
            f'{_graphs_dat_dir}/features/{qgroup}_query_{n}_variations_features_{corpus}_uqv.JSON'
        )

        # BUGFIX: the path was missing the '~/' prefix ('QppUqvProj/...')
        # and resolved relative to the CWD, unlike every other path here
        # (cf. the sibling __set_paths which roots it at _base_dir = ~/...).
        cls.geo_mean_file = dp.ensure_file(
            f'~/QppUqvProj/Results/{corpus}/uqvPredictions/raw/geo/predictions/predictions-20000'
        )

        # The variations file is used in the filter function - it consists of all the vars w/o the query at hand
        cls.query_vars_file = dp.ensure_file(
            f'{_graphs_dat_dir}/queries/queries_wo_{qgroup}_{n}_vars.txt')
        cls.quantile_vars_file = cls.query_vars_file

        _queries2predict = f'~/QppUqvProj/data/{corpus}/queries_{corpus}_{qgroup}.txt'
        cls.queries2predict_file = dp.ensure_file(_queries2predict)

        cls.real_ap_file = dp.ensure_file(
            f'~/QppUqvProj/Results/{corpus}/test/raw/QLmap1000')

        cls.geo_predictions_dir = dp.ensure_dir(
            f'{_corpus_res_dir}/referenceLists/{qgroup}/all_vars/sim_as_pred/geo/predictions'
        )
Example #17
0
 def create_query_files(self, n):
     """Write per-direction query files holding the n lowest (asce) or n
     highest (desc) AP variations, without the group's own queries."""
     directions = {('asce', filter_n_low_queries),
                   ('desc', filter_n_top_queries)}
     for direction, filter_func in directions:
         queries_dir = dp.ensure_dir(f'{self.data_dir}/{direction}/queries')
         out_file = f'{queries_dir}/queries_wo_{self.group}_{n}_vars.txt'
         filtered_df = filter_func(self.queries_obj.queries_df, self.raw_ap_obj, n)
         filtered_df[['qid', 'text']].to_csv(out_file,
                                             sep=":",
                                             header=False,
                                             index=False)
Example #18
0
def main():
    """Split every raw RM probabilities table of the corpus into its
    clipped / unclipped parts."""
    # corpus = 'ROBUST'
    corpus = 'ClueWeb12B'

    # raw_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}/uqvPredictions/raw/rsd/raw_data')
    raw_dir = dp.ensure_dir(
        f'~/QppUqvProj/Results/{corpus}/basicPredictions/title/rsd/raw_data')

    for raw_file in glob(f'{raw_dir}/probabilities-*'):
        separate_tables(raw_file)
Example #19
0
    def __set_paths(cls, corpus, qgroup, vars_quantile):
        """This method sets the default paths of the files and the working directories, it assumes the standard naming
         convention of the project"""
        # cls.predictor = predictor
        _corpus_res_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}')
        _corpus_dat_dir = dp.ensure_dir(f'~/QppUqvProj/data/{corpus}')

        # QL retrieval results over the raw query set
        _results_file = f'{_corpus_res_dir}/test/raw/QL.res'
        cls.results_file = os.path.normpath(_results_file)
        dp.ensure_file(cls.results_file)

        # QL retrieval results of the basic (title) queries
        _title_results_file = f'{_corpus_res_dir}/test/basic/QL.res'
        cls.title_res_file = os.path.normpath(_title_results_file)
        dp.ensure_file(cls.title_res_file)

        cls.queries_full_file = dp.ensure_file(
            f'{_corpus_dat_dir}/queries_{corpus}_UQV_full.stemmed.txt')

        # The variations file is used in the filter function - it consists of all the vars w/o the query at hand
        _queries_variations_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_wo_{qgroup}.txt'
        cls.queries_variations_file = dp.ensure_file(_queries_variations_file)

        # The vars quantile file is used in the filter function - it consists of the relevant vars quantile
        if vars_quantile == 'all':
            _queries_quantile_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_full.txt'
        else:
            _queries_quantile_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_{vars_quantile}_variants.txt'
        cls.queries_quantile_vars = dp.ensure_file(_queries_quantile_file)

        _queries_topic_file = f'{_corpus_dat_dir}/queries_{corpus}_{qgroup}.stemmed.txt'
        cls.queries_topic_file = dp.ensure_file(_queries_topic_file)

        # Fused retrieval results file
        _fused_results_file = f'{_corpus_res_dir}/test/fusion/QL.res'
        cls.fused_results_file = dp.ensure_file(_fused_results_file)

        # cls.output_dir = dp.ensure_dir(f'{_corpus_res_dir}/test/raw/')

        # Output dir for the similarity-as-predictor results of this group/quantile
        _predictions_out = f'{_corpus_res_dir}/uqvPredictions/referenceLists/{qgroup}/{vars_quantile}_vars/sim_as_pred/'
        cls.predictions_output_dir = dp.ensure_dir(_predictions_out)

        cls.pkl_dir = dp.ensure_dir(f'{_corpus_res_dir}/test/ref/pkl_files/')
Example #20
0
 def run_svm_fine_tune(self):
     """Grid-search the SVMrank C parameter over all train sets.

     Empties the models/classifications dirs, then runs svm_sub_procedure
     for every (C, train set) pair — in a worker pool when this process may
     spawn children, sequentially when it is already a daemon worker.
     """
     models_dir = f'{self.output_dir}models'
     dp.ensure_dir(models_dir)
     classification_dir = f'{self.output_dir}classifications'
     dp.ensure_dir(classification_dir)
     # Start the sweep from clean output directories
     dp.empty_dir(models_dir)
     dp.empty_dir(classification_dir)
     train_sets = glob.glob(f'{self.output_dir}datasets/train*')
     args_list = list(itertools.product(C_PARAMETERS, train_sets))
     if not mp.current_process().daemon:
         with mp.Pool(processes=self.cpu_cores) as pool:
             pool.starmap(
                 partial(svm_sub_procedure,
                         models_dir=models_dir,
                         classification_dir=classification_dir), args_list)
     else:
         # FIX: the loop variable used to shadow the 'train_sets' list,
         # which was misleading; each item is a single train-set path.
         for c, train_set in args_list:
             svm_sub_procedure(c,
                               train_set,
                               models_dir=models_dir,
                               classification_dir=classification_dir)
Example #21
0
    def save_predictions(self, df: pd.DataFrame):
        """Average the filtered per-variation similarities by topic and write
        each similarity measure as its own predictions file."""
        topic_means = dp.convert_vid_to_qid(
            self._filter_queries(df).groupby('topic').mean())

        # One output dir per similarity-as-predictor measure
        out_dirs = {
            sim: dp.ensure_dir(f'{self.predictions_output_dir}/{sim}/predictions')
            for sim in ('rboP', 'FrboP', 'topDocsP', 'jcP')
        }

        topic_means[f'RBO_EXT_{self.rbo_top}'].to_csv(
            f'{out_dirs["rboP"]}/predictions-{self.rbo_top}', sep=' ')
        topic_means[f'RBO_FUSED_EXT_{self.rbo_top}'].to_csv(
            f'{out_dirs["FrboP"]}/predictions-{self.rbo_top}', sep=' ')
        topic_means[f'Top_{self.top_docs_overlap}_Docs_overlap'].to_csv(
            f'{out_dirs["topDocsP"]}/predictions-{self.top_docs_overlap}', sep=' ')
        topic_means['Jac_coefficient'].to_csv(
            f'{out_dirs["jcP"]}/predictions-{self.rbo_top}', sep=' ')
Example #22
0
 def generate_qpp_reference_predictions(self, predictor):
     """Run the reference-list QPP pipeline with *predictor* for every
     direction and every number of variations up to max_n."""
     print(f'\n---Generating qpp ref predictions with {predictor}---\n')
     for direction in {'asce', 'desc'}:
         dp.ensure_dir(f'{self.data_dir}/{direction}')
         for n_vars in range(1, self.max_n + 1):
             ref = QueryPredictionRef(predictor,
                                      self.corpus,
                                      qgroup=self.group,
                                      vars_quantile='all',
                                      graphs=direction,
                                      n=n_vars)
             ref.calc_queries()
Example #23
0
 def run_svm(self):
     """Train and classify SVMrank models for all 2x30 CV splits with a
     fixed C=1."""
     c = '1'
     # BUGFIX: svm_learn was 'svmRank/svm_rank_learn' (no '~/' prefix) and
     # resolved relative to the CWD, inconsistent with svm_classify and with
     # run_svm_fine_tune where both binaries live under the home dir.
     svm_learn = '~/svmRank/svm_rank_learn'
     svm_classify = '~/svmRank/svm_rank_classify'
     models_dir = self.output_dir.replace('datasets', 'models')
     dp.ensure_dir(models_dir)
     classification_dir = self.output_dir.replace('datasets',
                                                  'classifications')
     # Clear any stale outputs from previous runs
     run(f'rm -rfv {models_dir}*', shell=True)
     run(f'rm -rfv {classification_dir}*', shell=True)
     dp.ensure_dir(classification_dir)
     for set_id in range(1, 31):
         for subset in ['a', 'b']:
             run('{0} -c {1} {2}/train_{3}_{4}.dat {5}/model_{3}_{4}'.
                 format(svm_learn, c, self.output_dir, set_id, subset,
                        models_dir),
                 shell=True)
             run('{0} {1}/test_{2}_{3}.dat {4}/model_{2}_{3} {5}/predictions_{2}_{3}'
                 .format(svm_classify, self.output_dir, set_id, subset,
                         models_dir, classification_dir),
                 shell=True)
Example #24
0
 def generate_features(self, n):
     """Build and persist the reference-query feature set for n variations,
     once per ranking direction."""
     print(f'\n---Generating Features for {n} vars---\n')
     for direction in {'asce', 'desc'}:
         features_dir = dp.ensure_dir(f'{self.data_dir}/{direction}/features')
         factory = RefQueryFeatureFactory(corpus=self.corpus,
                                          queries_group=self.group,
                                          vars_quantile='all',
                                          graphs=direction,
                                          n=n)
         features_df = load_full_features_df(features_factory_obj=factory)
         out_file = (f'{features_dir}/{self.group}_query_{n}_variations'
                     f'_features_{self.corpus}_uqv.JSON')
         features_df.reset_index().to_json(out_file)
Example #25
0
 def calc_single_query_result(self, predictor):
     """Run the 2x30 CV evaluation over the basic (0 variations) predictions
     of *predictor* and cache its mean test result."""
     print(f'\n---Generating {predictor} 0 vars results---\n')
     predictions_dir = dp.ensure_dir(
         f'{self.basic_predictions_dir}/{predictor}/predictions')
     cv = InterTopicCrossValidation(k=2,
                                    rep=30,
                                    folds_map_file=self.cv_map_file,
                                    predictions_dir=predictions_dir,
                                    load=True,
                                    ap_file=self.query_ap_file,
                                    test=self.corr_measure)
     self.basic_results_dict[predictor] = cv.calc_test_results()
Example #26
0
 def write_results(self, df, column, lambda_param, oracle=False):
     """Write each column of *df* as a predictions file, encoding the
     similarity function, its parameter (when present) and the lambda value
     in the file name. Oracle results go to a separate subtree.
     """
     sim_func = get_simfunct(column)
     # jac/uni/geo similarities carry no numeric parameter in the column name
     if sim_func not in ('jac', 'uni', 'geo'):
         sim_param = [s for s in column.split('_') if s.isdigit()][0]
     else:
         sim_param = None
     if oracle:
         output_dir = dp.ensure_dir(f'{self.output_dir}/oracle')
     else:
         output_dir = dp.ensure_dir(f'{self.output_dir}/general')
     # The target dir depends only on sim_func/predictor — build it once,
     # not once per column.
     _file_path = f'{output_dir}/{sim_func}/{self.predictor}/predictions/'
     dp.ensure_dir(_file_path)
     for col in df.columns:
         _file_name = col.replace('score_', 'predictions-')
         if sim_param:
             file_name = f'{_file_path}{_file_name}+{sim_func}+{sim_param}+lambda+{lambda_param}'
         else:
             file_name = f'{_file_path}{_file_name}+lambda+{lambda_param}'
         df[col].to_csv(file_name,
                        sep=" ",
                        header=False,
                        index=True,
                        float_format='%f')
Example #27
0
    def __set_paths(cls, corpus, group):
        """Set the class-level input/output paths for *corpus* and query
        *group*, assuming the project's standard directory layout."""
        _corpus_test_dir = dp.ensure_dir(
            f'~/QppUqvProj/Results/{corpus}/test/')

        # Basic predictions dir
        cls.basic_predictions_dir = dp.ensure_dir(
            f'~/QppUqvProj/Results/{corpus}/basicPredictions/{group}/')
        # AP file to pick variations according to AP
        cls.raw_ap_file = dp.ensure_file(f'{_corpus_test_dir}/raw/QLmap1000')
        # AP file for the cross validation process
        cls.query_ap_file = dp.ensure_file(
            f'{_corpus_test_dir}/ref/QLmap1000-{group}')
        # CV folds mapping file
        cls.cv_map_file = dp.ensure_file(
            f'{_corpus_test_dir}/2_folds_30_repetitions.json')
        # Queries file with all the variations except the ones to be predicted
        cls.queries_file = dp.ensure_file(
            f'~/QppUqvProj/data/{corpus}/queries_{corpus}_UQV_wo_{group}.txt')
        # The data dir for the Graphs
        cls.data_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}/data')
        # The results base dir for the Graphs
        cls.results_dir = dp.ensure_dir(
            f'~/QppUqvProj/Graphs/{corpus}/referenceLists/{group}')
Example #28
0
 def generate_sim_predictions(self, k):
     """Generate similarity-based predictions with top-k document lists for
     every direction and 1..max_n variations; pickled intermediates are
     reused once they exist."""
     print(f'\n---Generating sim predictions {k} docs---\n')
     load_pickle = self.load_from_pkl
     for direction in {'asce', 'desc'}:
         dp.ensure_dir(f'{self.data_dir}/{direction}')
         for n_vars in range(1, self.max_n + 1):
             factory = RefQueryFeatureFactory(self.corpus,
                                              queries_group=self.group,
                                              vars_quantile='all',
                                              rbo_top=k,
                                              top_docs_overlap=k,
                                              graphs=direction,
                                              n=n_vars)
             factory.generate_predictions(load_pickle)
             # After the first generation the pickles exist on disk
             load_pickle = True
Example #29
0
    def __set_graph_paths(cls, corpus, qgroup, direct, n):
        """This method sets the default paths of the files and the working directories, it assumes the standard naming
         convention of the project"""
        # cls.predictor = predictor
        _corpus_res_dir = dp.ensure_dir(f'~/QppUqvProj/Results/{corpus}')
        _corpus_dat_dir = dp.ensure_dir(f'~/QppUqvProj/data/{corpus}')

        _graphs_base_dir = dp.ensure_dir(f'~/QppUqvProj/Graphs/{corpus}')
        _graphs_res_dir = dp.ensure_dir(
            f'{_graphs_base_dir}/referenceLists/{qgroup}/{direct}/{n}_vars')
        _graphs_dat_dir = dp.ensure_dir(f'{_graphs_base_dir}/data')

        # Number of variations this instance was configured for
        cls.number_of_vars = n

        # QL retrieval results over the raw query set
        _results_file = f'{_corpus_res_dir}/test/raw/QL.res'
        cls.results_file = os.path.normpath(_results_file)
        dp.ensure_file(cls.results_file)

        # QL retrieval results of the basic (title) queries
        _title_results_file = f'{_corpus_res_dir}/test/basic/QL.res'
        cls.title_res_file = os.path.normpath(_title_results_file)
        dp.ensure_file(cls.title_res_file)

        _queries_full_file = f'{_corpus_dat_dir}/queries_{corpus}_UQV_full.stemmed.txt'
        cls.queries_full_file = dp.ensure_file(_queries_full_file)

        # The variations file is used in the filter function - it consists of all the vars w/o the query at hand
        _queries_variations_file = f'{_graphs_dat_dir}/{direct}/queries/queries_wo_{qgroup}_{n}_vars.txt'
        cls.queries_variations_file = dp.ensure_file(_queries_variations_file)
        cls.queries_quantile_vars = cls.queries_variations_file

        _queries_topic_file = f'{_corpus_dat_dir}/queries_{corpus}_{qgroup}.stemmed.txt'
        cls.queries_topic_file = dp.ensure_file(_queries_topic_file)

        _fused_results_file = f'{_corpus_res_dir}/test/fusion/QL.res'
        # _fused_results_file = f'{_corpus_res_dir}/test/fusion/all_wo_{qgroup}_fused_QL.res'
        cls.fused_results_file = dp.ensure_file(_fused_results_file)

        # cls.output_dir = dp.ensure_dir(f'{_graphs_res_dir}/test/raw/')

        # Output dir for similarity-as-predictor results of this graph point
        cls.predictions_output_dir = dp.ensure_dir(
            f'{_graphs_res_dir}/sim_as_pred/')

        cls.pkl_dir = dp.ensure_dir(f'{_graphs_dat_dir}/pkl_files/features')
Example #30
0
def separate_tables(raw_file):
    """Split a raw RM probabilities table into two files.

    Rows with a clipping marker go to '<name>+c100', rows without one to
    '<name>+c0'; both are written under the sibling 'data' dir without the
    clipping column.
    """
    raw_df = pd.read_table(raw_file,
                           names=['qid', 'term', 'probability', 'clipping'],
                           sep=' ')
    # Compute the mask once; notna() is the idiomatic form of ~isna()
    _clipped_mask = raw_df['clipping'].notna()
    full_rm_df = raw_df.loc[~_clipped_mask].drop('clipping', axis=1)
    clipped_df = raw_df.loc[_clipped_mask].drop('clipping', axis=1)

    _parts = raw_file.rsplit('/', 1)
    _file_name = _parts[-1]
    _dir = dp.ensure_dir(_parts[0].replace('raw_data', 'data'))
    fullrm_file = f'{_file_name}+c0'
    clipped_file = f'{_file_name}+c100'

    clipped_df.to_csv(f'{_dir}/{clipped_file}',
                      sep=" ",
                      header=False,
                      index=False,
                      float_format='%f')
    full_rm_df.to_csv(f'{_dir}/{fullrm_file}',
                      sep=" ",
                      header=False,
                      index=False,
                      float_format='%f')