Example #1
0
    def run_kfold_cross_validation(cls,
                                   train_test_model_class,
                                   model_param,
                                   results_or_df,
                                   kfold,
                                   logger=None):
        """
        Standard k-fold cross validation, given hyper-parameter set model_param.

        :param train_test_model_class: TrainTestModel subclass used to train
        and test on each fold
        :param model_param: dict of hyper-parameters passed to the model
        :param results_or_df: list of BasicResult, or pandas.DataFrame
        :param kfold: if it is an integer, it is the number of folds; if it is
        a list of index lists, then each inner list contains the row indices of
        the dataframe selected as one fold
        :param logger: optional logger for per-fold progress messages
        :return: dict with keys 'aggr_stats', 'statss', 'models', 'contentids'
        """

        if isinstance(kfold, (int, long)):
            kfold_type = 'int'
        elif isinstance(kfold, (list, tuple)):
            kfold_type = 'list'
        else:
            assert False, 'kfold must be either a list of lists or an integer.'

        # if input is integer (e.g. 4), reconstruct kfold in list of indices
        # format
        if kfold_type == 'int':
            num_fold = kfold
            dataframe_size = len(results_or_df)
            fold_size = int(floor(dataframe_size / num_fold))
            kfold = []
            for fold in range(num_fold):
                index_start = fold * fold_size
                # let the last fold absorb the remainder rows, so that the
                # trailing dataframe_size % num_fold rows are not silently
                # dropped when dataframe_size is not divisible by num_fold
                if fold == num_fold - 1:
                    index_end = dataframe_size
                else:
                    index_end = (fold + 1) * fold_size
                kfold.append(range(index_start, index_end))

        assert len(kfold) >= 2, 'kfold list must have length >= 2 for k-fold ' \
                                'cross validation.'

        statss = []
        models = []
        contentids = []

        for fold in range(len(kfold)):

            if logger: logger.info("Fold {}...".format(fold))

            # current fold is the test set; the union of all remaining folds
            # is the training set
            test_index_range = kfold[fold]
            train_index_range = []
            for train_fold in range(len(kfold)):
                if train_fold != fold:
                    train_index_range += kfold[train_fold]

            output = cls.run_cross_validation(train_test_model_class,
                                              model_param,
                                              results_or_df,
                                              train_index_range,
                                              test_index_range)

            statss.append(output['stats'])
            models.append(output['model'])

            contentids += list(output['contentids'])

        aggr_stats = TrainTestModel.aggregate_stats_list(statss)

        output = {}
        output['aggr_stats'] = aggr_stats
        output['statss'] = statss
        output['models'] = models

        assert contentids is not None
        output['contentids'] = contentids

        return output
Example #2
0
    def run_nested_kfold_cross_validation(cls,
                                          train_test_model_class,
                                          model_param_search_range,
                                          results_or_df,
                                          kfold,
                                          search_strategy='grid',
                                          random_search_times=100,
                                          logger=None):
        """
        Nested k-fold cross validation, given hyper-parameter search range. The
        search range is specified in the format of, e.g.:
        {'norm_type':['normalize', 'clip_0to1', 'clip_minus1to1'],
         'n_estimators':[10, 50],
         'random_state': [0]}
        :param train_test_model_class: TrainTestModel subclass used to train
        and test on each fold
        :param model_param_search_range: dict mapping each hyper-parameter
        name to the list/range of values to search over
        :param results_or_df: list of BasicResult, or pandas.DataFrame
        :param kfold: if it is an integer, it is the number of folds; if it is
        a list of index lists, then each inner list contains the row indices of
        the dataframe selected as one fold
        :param search_strategy: either 'grid' or 'random'
        :param random_search_times: number of parameter samples drawn when
        search_strategy is 'random'
        :param logger: optional logger for per-fold progress messages
        :return: dict with keys 'aggr_stats', 'top_model_param', 'top_ratio',
        'statss', 'model_params', 'contentids'
        """

        if isinstance(kfold, (int, long)):
            kfold_type = 'int'
        elif isinstance(kfold, (list, tuple)):
            kfold_type = 'list'
        else:
            assert False, 'kfold must be either a list of lists or an integer.'

        # if input is integer (e.g. 4), reconstruct kfold in list of indices
        # format
        if kfold_type == 'int':
            num_fold = kfold
            dataframe_size = len(results_or_df)
            fold_size = int(floor(dataframe_size / num_fold))
            kfold = []
            for fold in range(num_fold):
                index_start = fold * fold_size
                # let the last fold absorb the remainder rows, so that the
                # trailing dataframe_size % num_fold rows are not silently
                # dropped when dataframe_size is not divisible by num_fold
                if fold == num_fold - 1:
                    index_end = dataframe_size
                else:
                    index_end = (fold + 1) * fold_size
                kfold.append(range(index_start, index_end))

        # need >= 3 folds: one held out for the outer test, and at least two
        # remaining for the inner k-fold parameter search
        assert len(kfold) >= 3, 'kfold list must have length >= 3 for nested ' \
                                'k-fold cross validation.'

        if search_strategy == 'grid':
            cls._assert_grid_search(model_param_search_range)
            list_model_param = cls._unroll_dict_of_lists(
                model_param_search_range)
        elif search_strategy == 'random':
            cls._assert_random_search(model_param_search_range)
            list_model_param = cls._sample_model_param_list(
                model_param_search_range, random_search_times)
        else:
            assert False, "Unknown search_strategy: {}".format(search_strategy)

        statss = []
        model_params = []
        contentids = []

        for fold in range(len(kfold)):

            if logger: logger.info("Fold {}...".format(fold))

            # current fold is the outer test set; the remaining folds form the
            # training set, kept both flattened (for the final train/test run)
            # and as a list of folds (for the inner k-fold parameter search)
            test_index_range = kfold[fold]
            train_index_range = []
            train_index_range_in_list_of_indices = []
            for train_fold in range(len(kfold)):
                if train_fold != fold:
                    train_index_range += kfold[train_fold]
                    train_index_range_in_list_of_indices.append(kfold[train_fold])

            # inner loop: iterate through all candidate model_params, keeping
            # the one with the best aggregate SRCC over the inner folds
            best_model_param = None
            best_stats = None
            for model_param in list_model_param:

                if logger: logger.info("\tModel parameter: {}".format(model_param))

                output = \
                    cls.run_kfold_cross_validation(train_test_model_class,
                                                   model_param,
                                                   results_or_df,
                                                   train_index_range_in_list_of_indices)
                stats = output['aggr_stats']

                if (best_stats is None) or (
                    TrainTestModel.get_objective_score(stats, type='SRCC')
                    >
                    TrainTestModel.get_objective_score(best_stats, type='SRCC')
                ):
                    best_stats = stats
                    best_model_param = model_param

            # run cross validation based on best model parameters
            output_ = cls.run_cross_validation(train_test_model_class,
                                               best_model_param,
                                               results_or_df,
                                               train_index_range,
                                               test_index_range)
            stats_ = output_['stats']

            statss.append(stats_)
            model_params.append(best_model_param)

            contentids += list(output_['contentids'])

        aggr_stats = TrainTestModel.aggregate_stats_list(statss)
        # most frequently selected parameter set across outer folds, and the
        # fraction of folds that selected it
        top_model_param, count = cls._find_most_frequent_dict(model_params)

        assert contentids is not None
        output__ = {
            'aggr_stats':aggr_stats,
            'top_model_param':top_model_param,
            'top_ratio':float(count) / len(model_params),
            'statss':statss,
            'model_params':model_params,
            'contentids':contentids,
        }

        return output__